!python --version
Python 3.12.4
!pip3 install --upgrade Pillow
!pip3 install pandas
!pip3 install seaborn
!pip3 install nltk
!pip3 install contractions
!pip3 install matplotlib
!pip3 install tensorflow
!pip3 install wordcloud
!pip3 install sklearn
!pip3 install imblearn
!pip3 install xgboost
!pip3 install contractions
!pip3 install -U threadpoolctl
!pip3 install imblearn
!pip3 install scikit-learn==1.2.2
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import time
import tensorflow as tf
import matplotlib
import sklearn
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dropout
from keras import models, layers
import nltk
import warnings
warnings.filterwarnings("ignore")
2024-10-30 01:50:53.399630: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
print("Pandas version : ", pd.__version__)
print("Numpy version : ", np.__version__)
print("Seaborn version : ", sns.__version__)
print("Matplotlib version : ", matplotlib.__version__)
print("Tensorflow version : ", tf.__version__)
print("Scikit learn version : ", sklearn.__version__)
print("NLTK version is: ", nltk.__version__)
Pandas version : 2.0.3 Numpy version : 1.24.3 Seaborn version : 0.13.2 Matplotlib version : 3.7.2 Tensorflow version : 2.16.1 Scikit learn version : 1.3.0 NLTK version is: 3.8.1
url = "https://raw.githubusercontent.com/ThavaseelanMohan/Machine_Learning/main/tweet_global_warming.csv"
filename = "tweet_global_warming.csv"
df_climate = pd.read_csv(url, encoding = 'unicode_escape')
df_climate
| username | date | tweet | existence | existence.confidence | |
|---|---|---|---|---|---|
| 0 | EthicalSkeptic | 16/06/2023 17:38 | Global warming report urges governments to act... | Yes | 1.0000 |
| 1 | ElonMuskAOC | 16/06/2023 17:07 | Fighting poverty and global warming in Africa ... | Yes | 1.0000 |
| 2 | swolecialism | 16/06/2023 17:07 | Carbon offsets: How a Vatican forest failed to... | Yes | 0.8786 |
| 3 | domesticetch | 16/06/2023 16:56 | Carbon offsets: How a Vatican forest failed to... | Yes | 1.0000 |
| 4 | ArbDogeAI | 16/06/2023 14:50 | URUGUAY: Tools Needed for Those Most Vulnerabl... | Yes | 0.8087 |
| ... | ... | ... | ... | ... | ... |
| 6088 | Rozenity | 07/01/2023 18:12 | @bloodless_coup "The phrase 'global warming' s... | Y | 1.0000 |
| 6089 | KariDru | 07/01/2023 17:47 | Virginia to Investigate Global Warming Scienti... | NaN | 1.0000 |
| 6090 | regohurtado | 07/01/2023 17:37 | Global warming you tube parody you will enjoy ... | N | 0.6411 |
| 6091 | icreatelife | 07/01/2023 17:30 | One-Eyed Golfer: Don't dare tell me about glob... | N | 1.0000 |
| 6092 | mckaywrigley | 07/01/2023 17:23 | man made global warming a hair brained theory ... | N | 1.0000 |
6093 rows × 5 columns
df_climate.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6093 entries, 0 to 6092 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 username 6093 non-null object 1 date 6093 non-null object 2 tweet 6090 non-null object 3 existence 4228 non-null object 4 existence.confidence 6090 non-null float64 dtypes: float64(1), object(4) memory usage: 238.1+ KB
df_climate.describe(include='all').T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| username | 6093 | 4203 | icreatelife | 29 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| date | 6093 | 5497 | 30/12/2022 02:11 | 14 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| tweet | 6090 | 5538 | No matter if you believe in global warming or ... | 20 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| existence | 4228 | 6 | Y | 2553 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| existence.confidence | 6090.0 | NaN | NaN | NaN | 0.794738 | 0.181711 | 0.0 | 0.6585 | 0.70765 | 1.0 | 2.0 |
# convert the date column into a datetime object
df_climate['date'] = pd.to_datetime(df_climate['date'])
# extract the day, month, and year components
df_climate['day'] = df_climate['date'].dt.day
df_climate['month'] = df_climate['date'].dt.month
df_climate['year'] = df_climate['date'].dt.year
df_climate['year'].value_counts()
year 2023 5799 2022 294 Name: count, dtype: int64
tweets_2023 = df_climate[df_climate['year']==2023]
tweets_2023.head(5)
| username | date | tweet | existence | existence.confidence | day | month | year | |
|---|---|---|---|---|---|---|---|---|
| 0 | EthicalSkeptic | 2023-06-16 17:38:00 | Global warming report urges governments to act... | Yes | 1.0000 | 16 | 6 | 2023 |
| 1 | ElonMuskAOC | 2023-06-16 17:07:00 | Fighting poverty and global warming in Africa ... | Yes | 1.0000 | 16 | 6 | 2023 |
| 2 | swolecialism | 2023-06-16 17:07:00 | Carbon offsets: How a Vatican forest failed to... | Yes | 0.8786 | 16 | 6 | 2023 |
| 3 | domesticetch | 2023-06-16 16:56:00 | Carbon offsets: How a Vatican forest failed to... | Yes | 1.0000 | 16 | 6 | 2023 |
| 4 | ArbDogeAI | 2023-06-16 14:50:00 | URUGUAY: Tools Needed for Those Most Vulnerabl... | Yes | 0.8087 | 16 | 6 | 2023 |
count_tweets = tweets_2023['month'].value_counts()
print(count_tweets)
color = ['lawngreen', 'fuchsia','lightsalmon','bisque', 'orange']
tweets_2023['month'].value_counts().sort_values().plot(kind='barh', title='Tweets Contribution by Month', stacked=True, color=color)
month 5 2326 4 1073 1 1015 6 789 3 596 Name: count, dtype: int64
<Axes: title={'center': 'Tweets Contribution by Month'}, ylabel='month'>
df_climate['username'].value_counts()
username
icreatelife 29
JeffLadish 24
ESYudkowsky 20
docsquiddy 19
ProfNoahGian 18
..
vibraslapathon 1
Nokathe1st 1
Dyaticedm 1
loki_monster 1
username 1
Name: count, Length: 4203, dtype: int64
df_climate.loc[df_climate['username'] == 'icreatelife'].reset_index(drop = True)
| username | date | tweet | existence | existence.confidence | day | month | year | |
|---|---|---|---|---|---|---|---|---|
| 0 | icreatelife | 2023-06-13 17:56:00 | Graham's exit from talks puts climate change b... | NaN | 0.8845 | 13 | 6 | 2023 |
| 1 | icreatelife | 2023-06-06 02:15:00 | Government Report Says Global Warming May Caus... | Yes | 0.5864 | 6 | 6 | 2023 |
| 2 | icreatelife | 2023-06-02 21:21:00 | This one explained the extreme cold weather we... | Yes | 0.5922 | 2 | 6 | 2023 |
| 3 | icreatelife | 2023-06-02 13:14:00 | the scientific community was scamed by global ... | No | 1.0000 | 2 | 6 | 2023 |
| 4 | icreatelife | 2023-06-01 23:57:00 | I am freezing still in Southern California. Gl... | No | 1.0000 | 1 | 6 | 2023 |
| 5 | icreatelife | 2023-06-01 12:56:00 | At new school climate change panel, heritage's... | NaN | 0.8278 | 1 | 6 | 2023 |
| 6 | icreatelife | 2023-06-01 02:32:00 | CNN: Graham's exit puts climate change bill in... | NaN | 0.5705 | 1 | 6 | 2023 |
| 7 | icreatelife | 2023-05-29 15:08:00 | Iceland Volcano Vs. Alternative Energy and Glo... | NaN | 0.8085 | 29 | 5 | 2023 |
| 8 | icreatelife | 2023-05-27 13:10:00 | China: The Key To Fixing Global Warming|With r... | Y | 0.7058 | 27 | 5 | 2023 |
| 9 | icreatelife | 2023-05-27 12:31:00 | Now on PBS: Going Green New York: Examining ho... | Y | 0.6421 | 27 | 5 | 2023 |
| 10 | icreatelife | 2023-05-19 13:16:00 | Immigration Reform and Climate Change: A Tale ... | NaN | 1.0000 | 19 | 5 | 2023 |
| 11 | icreatelife | 2023-05-18 14:45:00 | Brilliant Tips For Valentine's Day : A Screami... | NaN | 0.7230 | 18 | 5 | 2023 |
| 12 | icreatelife | 2023-05-14 15:39:00 | C3: New Mexico's Democrats/Liberals Push Globa... | N | 0.6969 | 14 | 5 | 2023 |
| 13 | icreatelife | 2023-05-10 21:54:00 | @glennbeck goldman ball sack, sally freddie an... | NaN | 0.6613 | 10 | 5 | 2023 |
| 14 | icreatelife | 2023-05-10 13:35:00 | DK Matai: Are Global Warming, Volcanoes and Ea... | Y | 1.0000 | 10 | 5 | 2023 |
| 15 | icreatelife | 2023-05-06 23:33:00 | This weekend huge ice block from the Hualcan g... | N | 0.6933 | 6 | 5 | 2023 |
| 16 | icreatelife | 2023-05-03 23:41:00 | Guest column, Elliott Denniston: What can we d... | Y | 0.6416 | 3 | 5 | 2023 |
| 17 | icreatelife | 2023-05-02 08:51:00 | YID With LID: The IPCC's Latest Climate Change... | N | 1.0000 | 2 | 5 | 2023 |
| 18 | icreatelife | 2023-05-01 18:02:00 | Is Global Warming a "Crock of S*%t?" http://ww... | NaN | 1.0000 | 1 | 5 | 2023 |
| 19 | icreatelife | 2023-05-01 03:40:00 | How much $ have we saved due to todays snow st... | N | 0.4908 | 1 | 5 | 2023 |
| 20 | icreatelife | 2023-04-27 22:06:00 | I think I understand why climate change causes... | N | 0.6391 | 27 | 4 | 2023 |
| 21 | icreatelife | 2023-03-31 05:15:00 | @aiki14 Global Warming? Earthquake Magnitude 4... | Y | 0.6547 | 31 | 3 | 2023 |
| 22 | icreatelife | 2023-01-07 15:19:00 | @Scratch5151Morning. Too bad about the cold an... | N | 0.6507 | 7 | 1 | 2023 |
| 23 | icreatelife | 2023-05-15 18:58:00 | Global warming conference set Sunday http://bi... | NaN | 0.7237 | 15 | 5 | 2023 |
| 24 | icreatelife | 2023-05-15 19:10:00 | ROTHBARD & RUCKER: Global warming's weak links... | NaN | 0.6857 | 15 | 5 | 2023 |
| 25 | icreatelife | 2022-12-30 12:03:00 | I wonder how much Iceland Volcano will contrib... | Y | 0.6307 | 30 | 12 | 2022 |
| 26 | icreatelife | 2023-01-07 23:59:00 | Explorer to discuss global warming: Renowned p... | Y | 0.7140 | 7 | 1 | 2023 |
| 27 | icreatelife | 2023-01-07 17:47:00 | THE best climate change economics primer. - ja... | Y | 0.6940 | 7 | 1 | 2023 |
| 28 | icreatelife | 2023-01-07 17:30:00 | One-Eyed Golfer: Don't dare tell me about glob... | N | 1.0000 | 7 | 1 | 2023 |
df_climate.loc[df_climate['username'] == 'JeffLadish'].reset_index(drop = True)
| username | date | tweet | existence | existence.confidence | day | month | year | |
|---|---|---|---|---|---|---|---|---|
| 0 | JeffLadish | 2023-06-14 17:16:00 | QUT researchers track climate change [link] | Yes | 0.5194 | 14 | 6 | 2023 |
| 1 | JeffLadish | 2023-06-14 17:12:00 | Global Warming: Ocean chemistry is changing fa... | Yes | 1.0000 | 14 | 6 | 2023 |
| 2 | JeffLadish | 2023-05-24 22:34:00 | Spring storm season starting a little late thi... | N | 0.6502 | 24 | 5 | 2023 |
| 3 | JeffLadish | 2023-05-23 23:55:00 | Oxfam: Climate change devastating rural Ethiop... | Y | 1.0000 | 23 | 5 | 2023 |
| 4 | JeffLadish | 2023-05-20 00:42:00 | @junerenner ,get a headache when grandson tell... | N | 0.6884 | 20 | 5 | 2023 |
| 5 | JeffLadish | 2023-04-29 15:39:00 | RT @TreeHugger: Worth reminding people: Al Gor... | Y | 0.6640 | 29 | 4 | 2023 |
| 6 | JeffLadish | 2023-04-24 22:00:00 | Right-wingers already claiming that the blizza... | Y | 0.7014 | 24 | 4 | 2023 |
| 7 | JeffLadish | 2023-04-24 20:49:00 | How Global Warming Makes Blizzards Worse http:... | Y | 1.0000 | 24 | 4 | 2023 |
| 8 | JeffLadish | 2023-04-02 12:17:00 | Global warming??? ;P global warming http://bit... | N | 0.5901 | 2 | 4 | 2023 |
| 9 | JeffLadish | 2023-03-31 15:52:00 | "Fox News has Al Gore's book on global warming... | Y | 0.6805 | 31 | 3 | 2023 |
| 10 | JeffLadish | 2023-03-31 15:38:00 | Video: MS-NBC's Brewer claims heavy snowfall p... | Y | 1.0000 | 31 | 3 | 2023 |
| 11 | JeffLadish | 2023-03-31 15:28:00 | Rt @InvasiveNotes RT @OceanChampions: No, snow... | Y | 1.0000 | 31 | 3 | 2023 |
| 12 | JeffLadish | 2023-03-31 15:16:00 | The Reference Frame: Global warming causes sno... | Y | 1.0000 | 31 | 3 | 2023 |
| 13 | JeffLadish | 2023-03-27 19:36:00 | @plutoniumpage Yeah that must be why every run... | NaN | 0.6642 | 27 | 3 | 2023 |
| 14 | JeffLadish | 2023-03-26 21:04:00 | @joshtyler pot causes global warming? who ever... | N | 0.6866 | 26 | 3 | 2023 |
| 15 | JeffLadish | 2023-01-23 12:44:00 | Free Reading !!! Ahead of my time vis a vis gl... | N | 0.5124 | 23 | 1 | 2023 |
| 16 | JeffLadish | 2023-01-20 00:23:00 | Obama spending our Tax money 4 New Federal Cli... | N | 0.6939 | 20 | 1 | 2023 |
| 17 | JeffLadish | 2023-01-16 17:13:00 | From The New Album Mama Roots Global Warming h... | NaN | 1.0000 | 16 | 1 | 2023 |
| 18 | JeffLadish | 2023-01-11 22:57:00 | Predicting future climate change through sedim... | Y | 1.0000 | 11 | 1 | 2023 |
| 19 | JeffLadish | 2023-01-11 22:43:00 | We take climate change seriously & as a busine... | Y | 1.0000 | 11 | 1 | 2023 |
| 20 | JeffLadish | 2023-01-11 21:37:00 | How Climate-Change Fanatics Corrupted Science ... | N | 1.0000 | 11 | 1 | 2023 |
| 21 | JeffLadish | 2023-01-09 16:16:00 | Letters: Day for the climate change question |... | NaN | 0.6568 | 9 | 1 | 2023 |
| 22 | JeffLadish | 2023-01-08 13:00:00 | Putting people first ' Psychology, climate cha... | Y | 0.6554 | 8 | 1 | 2023 |
| 23 | JeffLadish | 2023-01-08 11:45:00 | Global Warming Causes Earthquakes. Suuuuure it... | N | 1.0000 | 8 | 1 | 2023 |
df_climate = df_climate.drop(columns=['username','date','day','month','year'])
df_climate.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6093 entries, 0 to 6092 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet 6090 non-null object 1 existence 4228 non-null object 2 existence.confidence 6090 non-null float64 dtypes: float64(1), object(2) memory usage: 142.9+ KB
print('Number of duplicated rows: ' , len(df_climate[df_climate.duplicated()]))
Number of duplicated rows: 130
df_climate = df_climate.drop_duplicates().reset_index(drop=True)
print('Number of duplicated rows: ' , len(df_climate[df_climate.duplicated()]))
Number of duplicated rows: 0
print('Number of duplicated rows in tweets: ' , len(df_climate[df_climate.duplicated(subset=['tweet'])]))
Number of duplicated rows in tweets: 424
df_climate = df_climate.drop_duplicates(subset=['tweet'], keep='first').reset_index(drop=True)
print('Number of duplicated rows in tweets: ' , len(df_climate[df_climate.duplicated(subset=['tweet'])]))
Number of duplicated rows in tweets: 0
df_climate.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5539 entries, 0 to 5538 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet 5538 non-null object 1 existence 3857 non-null object 2 existence.confidence 5536 non-null float64 dtypes: float64(1), object(2) memory usage: 129.9+ KB
df_climate = df_climate.drop(columns=['existence','existence.confidence'])
df_climate.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5539 entries, 0 to 5538 Data columns (total 1 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet 5538 non-null object dtypes: object(1) memory usage: 43.4+ KB
df_climate.isna().sum()
tweet 1 dtype: int64
plt.figure(figsize=(20,4))
sns.heatmap((df_climate.isnull().sum()).to_frame(name='').T,cmap='Blues', annot=True,
fmt='0.0f').set_title('Count of Missing Values', fontsize=20)
plt.show()
df_climate = df_climate.dropna().reset_index(drop=True)
df_climate.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5538 entries, 0 to 5537 Data columns (total 1 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet 5538 non-null object dtypes: object(1) memory usage: 43.4+ KB
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentiment = SentimentIntensityAnalyzer()
print(sentiment.polarity_scores("Climate change & sustainability will be a key driver of future economic development."))
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
[nltk_data] Downloading package vader_lexicon to [nltk_data] /Users/thavaseelan/nltk_data... [nltk_data] Package vader_lexicon is already up-to-date!
df_climate['compound'] = [sentiment.polarity_scores(scores)['compound'] for scores in df_climate['tweet'].astype(str)]
df_climate['negative'] = [sentiment.polarity_scores(scores)['neg'] for scores in df_climate['tweet'].astype(str)]
df_climate['neutral'] = [sentiment.polarity_scores(scores)['neu'] for scores in df_climate['tweet'].astype(str)]
df_climate['positive'] = [sentiment.polarity_scores(scores)['pos'] for scores in df_climate['tweet'].astype(str)]
df_climate[['compound','negative','neutral','positive']].describe()
| compound | negative | neutral | positive | |
|---|---|---|---|---|
| count | 5538.000000 | 5538.000000 | 5538.000000 | 5538.000000 |
| mean | 0.046906 | 0.074098 | 0.813387 | 0.112517 |
| std | 0.360251 | 0.108085 | 0.152094 | 0.115992 |
| min | -0.950700 | 0.000000 | 0.105000 | 0.000000 |
| 25% | -0.153100 | 0.000000 | 0.714000 | 0.000000 |
| 50% | 0.000000 | 0.000000 | 0.825000 | 0.094000 |
| 75% | 0.273200 | 0.140000 | 0.929000 | 0.176000 |
| max | 0.967700 | 0.623000 | 1.000000 | 0.683000 |
df_climate.loc[df_climate['positive'] > 0.5].reset_index(drop=True)
| tweet | compound | negative | neutral | positive | |
|---|---|---|---|---|---|
| 0 | @SooperMexican Global warming. Clearly. | 0.5106 | 0.000 | 0.317 | 0.683 |
| 1 | Great article [link] | 0.6249 | 0.000 | 0.328 | 0.672 |
| 2 | Safe, secure nuclear energy must be part of cl... | 0.8271 | 0.000 | 0.452 | 0.548 |
| 3 | COW FARTS EXONERATED FROM GLOBAL WARMING [link] | 0.7065 | 0.000 | 0.460 | 0.540 |
| 4 | Safe, secure nuclear energy must be part of cl... | 0.8271 | 0.000 | 0.452 | 0.548 |
| 5 | @deannatdon HAHAH LOL global warming! | 0.6625 | 0.000 | 0.356 | 0.644 |
| 6 | http://www.documentarywire.com/great-global-wa... | 0.3182 | 0.306 | 0.180 | 0.514 |
| 7 | Global Warming Wont Melt Ignorance stake :) | 0.6917 | 0.000 | 0.373 | 0.627 |
| 8 | globalwarming Global warming??? | 0.2824 | 0.000 | 0.483 | 0.517 |
| 9 | energy absorption Innovation has climate cha... | 0.7845 | 0.000 | 0.470 | 0.530 |
| 10 | The Great Thing About Global Warming http://dl... | 0.6908 | 0.000 | 0.467 | 0.533 |
| 11 | A lot of NJ's blocked me, but you're fine :) R... | 0.9590 | 0.077 | 0.409 | 0.514 |
| 12 | The great global warming collapse'... http://b... | 0.6908 | 0.000 | 0.412 | 0.588 |
| 13 | @TxJogger12 LOL, yeah I really want some globa... | 0.8961 | 0.000 | 0.336 | 0.664 |
| 14 | @jmac82 So much for global warming! :-P | 0.6280 | 0.000 | 0.494 | 0.506 |
| 15 | @LamontKingtv LOL I'm just making a funny. But... | 0.8679 | 0.000 | 0.426 | 0.574 |
| 16 | Man I love global warming. #fb | 0.7003 | 0.000 | 0.341 | 0.659 |
| 17 | @Gemstars LOL i was just being silly 4 the sak... | 0.9677 | 0.000 | 0.401 | 0.599 |
| 18 | good luck What can our families do to help sto... | 0.8360 | 0.103 | 0.329 | 0.568 |
| 19 | Save the Ozone Layer, Give Global Warming a Bo... | 0.8924 | 0.000 | 0.487 | 0.513 |
| 20 | @LaurenConrad Oh. Well, there goes the global... | 0.7378 | 0.000 | 0.492 | 0.508 |
| 21 | Global warming??? . globalwarming | 0.2824 | 0.000 | 0.483 | 0.517 |
| 22 | :P What can our families do to help stop globa... | 0.8788 | 0.098 | 0.311 | 0.592 |
| 23 | energy absorption Innovation has climate chan... | 0.7845 | 0.000 | 0.470 | 0.530 |
| 24 | FREAKING global warming. HA! | 0.1260 | 0.370 | 0.105 | 0.526 |
| 25 | :P http://bit.ly/bgxiU6 stop global warming Th... | 0.6166 | 0.272 | 0.202 | 0.526 |
| 26 | @thatsright_95 http://twitpic.com/11n0ql - oh ... | 0.8065 | 0.000 | 0.491 | 0.509 |
| 27 | Positive Proof of Global Warming! [PIC] http:... | 0.6696 | 0.000 | 0.477 | 0.523 |
| 28 | The Global Climate: Yes, Its Still Warming :) ... | 0.7430 | 0.000 | 0.490 | 0.510 |
| 29 | ;P http://bit.ly/dAuSjA global warming solutio... | 0.7964 | 0.000 | 0.435 | 0.565 |
| 30 | Attractive Nuisance Can federal courts help ta... | 0.7351 | 0.000 | 0.493 | 0.507 |
sns.histplot(df_climate['compound'])
<Axes: xlabel='compound', ylabel='Count'>
sns.histplot(df_climate['negative'])
<Axes: xlabel='negative', ylabel='Count'>
sns.histplot(df_climate['neutral'])
<Axes: xlabel='neutral', ylabel='Count'>
sns.histplot(df_climate['positive'])
<Axes: xlabel='positive', ylabel='Count'>
(df_climate['compound']<=0).sum()
2985
(df_climate['negative']<=0).sum()
3359
(df_climate['neutral']<=0).sum()
0
(df_climate['positive']<=0).sum()
1999
df_climate[df_climate['compound']<0].reset_index(drop=True)
| tweet | compound | negative | neutral | positive | |
|---|---|---|---|---|---|
| 0 | Fighting poverty and global warming in Africa ... | -0.6369 | 0.468 | 0.403 | 0.129 |
| 1 | Carbon offsets: How a Vatican forest failed to... | -0.4019 | 0.237 | 0.647 | 0.115 |
| 2 | URUGUAY: Tools Needed for Those Most Vulnerabl... | -0.2944 | 0.180 | 0.820 | 0.000 |
| 3 | Global warming evidence all around us|A messag... | -0.2960 | 0.195 | 0.664 | 0.142 |
| 4 | Climate change blamed as coastal whale migrati... | -0.4767 | 0.220 | 0.780 | 0.000 |
| ... | ... | ... | ... | ... | ... |
| 1635 | Bats, birds, and lizards can fight climate cha... | -0.3818 | 0.245 | 0.755 | 0.000 |
| 1636 | Is Global Warming A Hoax? - Find Answers to th... | -0.2500 | 0.179 | 0.684 | 0.137 |
| 1637 | RT @johnnyA99 'Ecocide 2 be used to prosecute ... | -0.2732 | 0.148 | 0.765 | 0.087 |
| 1638 | Global warming: The fossil fuel dilemma: Ameri... | -0.0258 | 0.076 | 0.852 | 0.072 |
| 1639 | One-Eyed Golfer: Don't dare tell me about glob... | -0.1280 | 0.101 | 0.821 | 0.077 |
1640 rows × 5 columns
df_climate[df_climate['compound'] > 0.05].reset_index(drop=True)
| tweet | compound | negative | neutral | positive | |
|---|---|---|---|---|---|
| 0 | Global warming report urges governments to act... | 0.1779 | 0.102 | 0.711 | 0.188 |
| 1 | RT @sejorg: RT @JaymiHeimbuch: Ocean Saltiness... | 0.1531 | 0.000 | 0.897 | 0.103 |
| 2 | Global warming to impact wheat, rice productio... | 0.0772 | 0.065 | 0.854 | 0.080 |
| 3 | How do we solve this global warming thing? [link] | 0.3400 | 0.000 | 0.673 | 0.327 |
| 4 | Blog|A preliminary analysis suggests that natu... | 0.5209 | 0.000 | 0.786 | 0.214 |
| ... | ... | ... | ... | ... | ... |
| 2506 | It's 83â¢_à and climbing in NYC. August weat... | 0.4559 | 0.000 | 0.827 | 0.173 |
| 2507 | @bloodless_coup "The phrase 'global warming' s... | 0.0772 | 0.135 | 0.673 | 0.193 |
| 2508 | Virginia to Investigate Global Warming Scienti... | 0.1531 | 0.000 | 0.814 | 0.186 |
| 2509 | Global warming you tube parody you will enjoy ... | 0.5859 | 0.000 | 0.652 | 0.348 |
| 2510 | man made global warming a hair brained theory ... | 0.0516 | 0.064 | 0.864 | 0.073 |
2511 rows × 5 columns
df_climate[df_climate['compound'] < -0.05].reset_index(drop=True)
| tweet | compound | negative | neutral | positive | |
|---|---|---|---|---|---|
| 0 | Fighting poverty and global warming in Africa ... | -0.6369 | 0.468 | 0.403 | 0.129 |
| 1 | Carbon offsets: How a Vatican forest failed to... | -0.4019 | 0.237 | 0.647 | 0.115 |
| 2 | URUGUAY: Tools Needed for Those Most Vulnerabl... | -0.2944 | 0.180 | 0.820 | 0.000 |
| 3 | Global warming evidence all around us|A messag... | -0.2960 | 0.195 | 0.664 | 0.142 |
| 4 | Climate change blamed as coastal whale migrati... | -0.4767 | 0.220 | 0.780 | 0.000 |
| ... | ... | ... | ... | ... | ... |
| 1606 | Ice ice baby! Get the lowdown on ice & climate... | -0.2714 | 0.099 | 0.901 | 0.000 |
| 1607 | Bats, birds, and lizards can fight climate cha... | -0.3818 | 0.245 | 0.755 | 0.000 |
| 1608 | Is Global Warming A Hoax? - Find Answers to th... | -0.2500 | 0.179 | 0.684 | 0.137 |
| 1609 | RT @johnnyA99 'Ecocide 2 be used to prosecute ... | -0.2732 | 0.148 | 0.765 | 0.087 |
| 1610 | One-Eyed Golfer: Don't dare tell me about glob... | -0.1280 | 0.101 | 0.821 | 0.077 |
1611 rows × 5 columns
df_climate['Predicted Sentiment'] = ''
def polarity_score(compound):
if compound > 0.05:
return "Positive"
elif compound < -0.05:
return "Negative"
elif compound >= -0.05 and compound < 0.05:
return "Neutral"
df_climate['Predicted Sentiment'] = df_climate['compound'].apply(lambda val: polarity_score(val))
sentiment_count = df_climate['Predicted Sentiment'].value_counts()
print(sentiment_count)
plt.figure(figsize=(5, 3))
plt.bar(sentiment_count.index, sentiment_count)
plt.title('Count Plot of AI tweets Sentiment')
plt.xlabel('Sentiments')
plt.ylabel('Count')
plt.show()
Predicted Sentiment Positive 2511 Negative 1611 Neutral 1416 Name: count, dtype: int64
#Define plot size
plt.figure(figsize=[3,3])
#Define column to use
data = df_climate["Predicted Sentiment"].value_counts(normalize=True)
#Define labels
labels = ["Positive","Negative", "Neutral"]
#Define color palette
colors = sns.color_palette('pastel')
#Create pie chart
plt.pie(data,labels=labels,colors=colors, autopct='%.0f%%')
plt.title("Pie chart visualization of Climate Change Sentiment label")
plt.show()
df_climate.shape
(5538, 6)
import re
from string import punctuation
import contractions
from string import punctuation
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
stem = PorterStemmer()
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
import nltk #3.8.1
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download("wordnet")
nltk.download('stopwords')
from wordcloud import WordCloud
import glob
from collections import Counter
[nltk_data] Downloading package punkt to [nltk_data] /Users/thavaseelan/nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package punkt_tab to [nltk_data] /Users/thavaseelan/nltk_data... [nltk_data] Package punkt_tab is already up-to-date! [nltk_data] Downloading package wordnet to [nltk_data] /Users/thavaseelan/nltk_data... [nltk_data] Package wordnet is already up-to-date! [nltk_data] Downloading package stopwords to [nltk_data] /Users/thavaseelan/nltk_data... [nltk_data] Package stopwords is already up-to-date!
def clean_text_with_sw(text):
text = str(text).lower() # converting to lower
text = " ".join([contractions.fix(expanded_word) for expanded_word in text.split()]) #Removing contractions
text = re.sub('\[.*?\]', '', text) # Removing text in square brackets
text = re.sub('https?://\S+|www\.\S+', '', text) #Removing links
text = re.sub('[^a-zA-Z#]', ' ', text)
text = re.sub('<.*?>+', '', text)
text = re.sub('[%s]' % re.escape(punctuation), '', text) #Removing Punctuations
text = re.sub('\w*\d\w*', '', text) #Removing words containing numbers
text = re.sub('\b\w{1,3}\b', '', text) #Any words less than 3 characters are replaced by blanks
tokenized_text = word_tokenize(str(text)) # Tokenizing text
filtered_text = [word for word in tokenized_text if word not in stopwords.words('english')] #Removing stopwords
lemmatization = [lemmatizer.lemmatize(word) for word in filtered_text] #Lemmatization
lemmatization = ' '.join(lemmatization)
lemmatization = ' '.join(words for words in lemmatization.split() if len(words)>3) #Removing short words less than 3 characters
return text
# apply clean text fuction on each contents in the dataset
df_climate['Tweets_with_sw'] = df_climate['tweet'].apply(lambda x:clean_text_with_sw(x))
df_climate['word_list_with_sw'] = df_climate['Tweets_with_sw'].apply(lambda x:word_tokenize(str(x)))
df_climate.head(5)
| tweet | compound | negative | neutral | positive | Predicted Sentiment | Tweets_with_sw | word_list_with_sw | |
|---|---|---|---|---|---|---|---|---|
| 0 | Global warming report urges governments to act... | 0.1779 | 0.102 | 0.711 | 0.188 | Positive | global warming report urges governments to act... | [global, warming, report, urges, governments, ... |
| 1 | Fighting poverty and global warming in Africa ... | -0.6369 | 0.468 | 0.403 | 0.129 | Negative | fighting poverty and global warming in africa | [fighting, poverty, and, global, warming, in, ... |
| 2 | Carbon offsets: How a Vatican forest failed to... | -0.4019 | 0.237 | 0.647 | 0.115 | Negative | carbon offsets how a vatican forest failed to... | [carbon, offsets, how, a, vatican, forest, fai... |
| 3 | URUGUAY: Tools Needed for Those Most Vulnerabl... | -0.2944 | 0.180 | 0.820 | 0.000 | Negative | uruguay tools needed for those most vulnerabl... | [uruguay, tools, needed, for, those, most, vul... |
| 4 | RT @sejorg: RT @JaymiHeimbuch: Ocean Saltiness... | 0.1531 | 0.000 | 0.897 | 0.103 | Positive | rt sejorg rt jaymiheimbuch ocean saltiness... | [rt, sejorg, rt, jaymiheimbuch, ocean, saltine... |
df_climate['word_list_with_sw'] = df_climate['Tweets_with_sw'].apply(lambda x:word_tokenize(str(x)))
df_climate.head(5)
| tweet | compound | negative | neutral | positive | Predicted Sentiment | Tweets_with_sw | word_list_with_sw | |
|---|---|---|---|---|---|---|---|---|
| 0 | Global warming report urges governments to act... | 0.1779 | 0.102 | 0.711 | 0.188 | Positive | global warming report urges governments to act... | [global, warming, report, urges, governments, ... |
| 1 | Fighting poverty and global warming in Africa ... | -0.6369 | 0.468 | 0.403 | 0.129 | Negative | fighting poverty and global warming in africa | [fighting, poverty, and, global, warming, in, ... |
| 2 | Carbon offsets: How a Vatican forest failed to... | -0.4019 | 0.237 | 0.647 | 0.115 | Negative | carbon offsets how a vatican forest failed to... | [carbon, offsets, how, a, vatican, forest, fai... |
| 3 | URUGUAY: Tools Needed for Those Most Vulnerabl... | -0.2944 | 0.180 | 0.820 | 0.000 | Negative | uruguay tools needed for those most vulnerabl... | [uruguay, tools, needed, for, those, most, vul... |
| 4 | RT @sejorg: RT @JaymiHeimbuch: Ocean Saltiness... | 0.1531 | 0.000 | 0.897 | 0.103 | Positive | rt sejorg rt jaymiheimbuch ocean saltiness... | [rt, sejorg, rt, jaymiheimbuch, ocean, saltine... |
word_list = Counter([item for sublist in df_climate['word_list_with_sw'] for item in sublist])
stop_words = pd.DataFrame(word_list.most_common(20))
stop_words.columns = ['Common_words','Frequency']
stop_words.style.background_gradient(cmap = 'Blues')
def addlabels(x,y):
for i in range(len(x)):
plt.text(i, y[i], y[i], ha = 'center')
x = stop_words['Common_words']
y = stop_words['Frequency']
plt.figure(figsize = (14,8))
plt.title( " Common words before stop words removal")
plt.xlabel("Common_words")
plt.ylabel("Frequency")
plt.bar(x,y)
#plt.gca().invert_yaxis()
addlabels(x, y)
plt.show()
plt.figure(figsize=(5, 3))
<Figure size 500x300 with 0 Axes>
<Figure size 500x300 with 0 Axes>
word_list = Counter([item for sublist in df_climate['word_list_with_sw'] for item in sublist])
stop_words = pd.DataFrame(word_list.most_common(5000))
stop_words.head(5)
| 0 | 1 | |
|---|---|---|
| 0 | climate | 3432 |
| 1 | change | 3124 |
| 2 | global | 2932 |
| 3 | warming | 2822 |
| 4 | the | 2288 |
stop_words.tail(5)
| 0 | 1 | |
|---|---|---|
| 4995 | signals | 2 |
| 4996 | jimmy | 2 |
| 4997 | carter | 2 |
| 4998 | nan | 2 |
| 4999 | talese | 2 |
def data_preprocessing(text):
text = str(text).lower() # converting to lower
text = " ".join([contractions.fix(expanded_word) for expanded_word in text.split()]) #Removing contractions
text = re.sub('\[.*?\]', '', text) # Removing text in square brackets
text = re.sub('https?://\S+|www\.\S+', '', text) #Removing links
text = re.sub(r'@\w+', '', text) #Removing words with @username
text = re.sub(r'#\w+', '', text) #Removing words with hashtags
text = re.sub('[^a-zA-Z#]', ' ', text)
text = re.sub('<.*?>+', '', text)
text = re.sub('[%s]' % re.escape(punctuation), '', text) #Removing Punctuations
text = re.sub('\w*\d\w*', '', text) #Removing words containing numbers
return text
# apply clean text fuction on each twitte in the training dataset
df_climate['Cleansed_Tweets'] = df_climate['tweet'].apply(lambda x:data_preprocessing(x))
df_climate.head(10)
| tweet | compound | negative | neutral | positive | Predicted Sentiment | Tweets_with_sw | word_list_with_sw | Cleansed_Tweets | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Global warming report urges governments to act... | 0.1779 | 0.102 | 0.711 | 0.188 | Positive | global warming report urges governments to act... | [global, warming, report, urges, governments, ... | global warming report urges governments to act... |
| 1 | Fighting poverty and global warming in Africa ... | -0.6369 | 0.468 | 0.403 | 0.129 | Negative | fighting poverty and global warming in africa | [fighting, poverty, and, global, warming, in, ... | fighting poverty and global warming in africa |
| 2 | Carbon offsets: How a Vatican forest failed to... | -0.4019 | 0.237 | 0.647 | 0.115 | Negative | carbon offsets how a vatican forest failed to... | [carbon, offsets, how, a, vatican, forest, fai... | carbon offsets how a vatican forest failed to... |
| 3 | URUGUAY: Tools Needed for Those Most Vulnerabl... | -0.2944 | 0.180 | 0.820 | 0.000 | Negative | uruguay tools needed for those most vulnerabl... | [uruguay, tools, needed, for, those, most, vul... | uruguay tools needed for those most vulnerabl... |
| 4 | RT @sejorg: RT @JaymiHeimbuch: Ocean Saltiness... | 0.1531 | 0.000 | 0.897 | 0.103 | Positive | rt sejorg rt jaymiheimbuch ocean saltiness... | [rt, sejorg, rt, jaymiheimbuch, ocean, saltine... | rt rt ocean saltiness shows global warming... |
| 5 | Global warming evidence all around us|A messag... | -0.2960 | 0.195 | 0.664 | 0.142 | Negative | global warming evidence all around us a messag... | [global, warming, evidence, all, around, us, a... | global warming evidence all around us a messag... |
| 6 | Migratory Birds' New Climate Change Strategy: ... | 0.0000 | 0.000 | 1.000 | 0.000 | Neutral | migratory birds new climate change strategy ... | [migratory, birds, new, climate, change, strat... | migratory birds new climate change strategy ... |
| 7 | Southern Africa: Competing for Limpopo Water: ... | 0.0000 | 0.000 | 1.000 | 0.000 | Neutral | southern africa competing for limpopo water ... | [southern, africa, competing, for, limpopo, wa... | southern africa competing for limpopo water ... |
| 8 | Global warming to impact wheat, rice productio... | 0.0772 | 0.065 | 0.854 | 0.080 | Positive | global warming to impact wheat rice productio... | [global, warming, to, impact, wheat, rice, pro... | global warming to impact wheat rice productio... |
| 9 | How do we solve this global warming thing? [link] | 0.3400 | 0.000 | 0.673 | 0.327 | Positive | how do we solve this global warming thing | [how, do, we, solve, this, global, warming, th... | how do we solve this global warming thing |
#Remove stop words
import nltk
nltk.download('stopwords')
stopwords=nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
clean_text=' '.join([word for word in text.split() if word not in stopwords])
return clean_text
[nltk_data] Downloading package stopwords to [nltk_data] /Users/thavaseelan/nltk_data... [nltk_data] Package stopwords is already up-to-date!
df_climate['Cleansed_Tweets'] = df_climate['Cleansed_Tweets'].apply(lambda text : remove_stopwords(text.lower()))
df_climate.head(n=10)
| tweet | compound | negative | neutral | positive | Predicted Sentiment | Tweets_with_sw | word_list_with_sw | Cleansed_Tweets | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Global warming report urges governments to act... | 0.1779 | 0.102 | 0.711 | 0.188 | Positive | global warming report urges governments to act... | [global, warming, report, urges, governments, ... | global warming report urges governments act br... |
| 1 | Fighting poverty and global warming in Africa ... | -0.6369 | 0.468 | 0.403 | 0.129 | Negative | fighting poverty and global warming in africa | [fighting, poverty, and, global, warming, in, ... | fighting poverty global warming africa |
| 2 | Carbon offsets: How a Vatican forest failed to... | -0.4019 | 0.237 | 0.647 | 0.115 | Negative | carbon offsets how a vatican forest failed to... | [carbon, offsets, how, a, vatican, forest, fai... | carbon offsets vatican forest failed reduce gl... |
| 3 | URUGUAY: Tools Needed for Those Most Vulnerabl... | -0.2944 | 0.180 | 0.820 | 0.000 | Negative | uruguay tools needed for those most vulnerabl... | [uruguay, tools, needed, for, those, most, vul... | uruguay tools needed vulnerable climate change |
| 4 | RT @sejorg: RT @JaymiHeimbuch: Ocean Saltiness... | 0.1531 | 0.000 | 0.897 | 0.103 | Positive | rt sejorg rt jaymiheimbuch ocean saltiness... | [rt, sejorg, rt, jaymiheimbuch, ocean, saltine... | rt rt ocean saltiness shows global warming int... |
| 5 | Global warming evidence all around us|A messag... | -0.2960 | 0.195 | 0.664 | 0.142 | Negative | global warming evidence all around us a messag... | [global, warming, evidence, all, around, us, a... | global warming evidence around us message glob... |
| 6 | Migratory Birds' New Climate Change Strategy: ... | 0.0000 | 0.000 | 1.000 | 0.000 | Neutral | migratory birds new climate change strategy ... | [migratory, birds, new, climate, change, strat... | migratory birds new climate change strategy st... |
| 7 | Southern Africa: Competing for Limpopo Water: ... | 0.0000 | 0.000 | 1.000 | 0.000 | Neutral | southern africa competing for limpopo water ... | [southern, africa, competing, for, limpopo, wa... | southern africa competing limpopo water climat... |
| 8 | Global warming to impact wheat, rice productio... | 0.0772 | 0.065 | 0.854 | 0.080 | Positive | global warming to impact wheat rice productio... | [global, warming, to, impact, wheat, rice, pro... | global warming impact wheat rice production in... |
| 9 | How do we solve this global warming thing? [link] | 0.3400 | 0.000 | 0.673 | 0.327 | Positive | how do we solve this global warming thing | [how, do, we, solve, this, global, warming, th... | solve global warming thing |
df_climate['Cleansed_Tweets'] = df_climate['Cleansed_Tweets'].apply(lambda x: x.split())
df_climate.head(n=10)
| tweet | compound | negative | neutral | positive | Predicted Sentiment | Tweets_with_sw | word_list_with_sw | Cleansed_Tweets | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Global warming report urges governments to act... | 0.1779 | 0.102 | 0.711 | 0.188 | Positive | global warming report urges governments to act... | [global, warming, report, urges, governments, ... | [global, warming, report, urges, governments, ... |
| 1 | Fighting poverty and global warming in Africa ... | -0.6369 | 0.468 | 0.403 | 0.129 | Negative | fighting poverty and global warming in africa | [fighting, poverty, and, global, warming, in, ... | [fighting, poverty, global, warming, africa] |
| 2 | Carbon offsets: How a Vatican forest failed to... | -0.4019 | 0.237 | 0.647 | 0.115 | Negative | carbon offsets how a vatican forest failed to... | [carbon, offsets, how, a, vatican, forest, fai... | [carbon, offsets, vatican, forest, failed, red... |
| 3 | URUGUAY: Tools Needed for Those Most Vulnerabl... | -0.2944 | 0.180 | 0.820 | 0.000 | Negative | uruguay tools needed for those most vulnerabl... | [uruguay, tools, needed, for, those, most, vul... | [uruguay, tools, needed, vulnerable, climate, ... |
| 4 | RT @sejorg: RT @JaymiHeimbuch: Ocean Saltiness... | 0.1531 | 0.000 | 0.897 | 0.103 | Positive | rt sejorg rt jaymiheimbuch ocean saltiness... | [rt, sejorg, rt, jaymiheimbuch, ocean, saltine... | [rt, rt, ocean, saltiness, shows, global, warm... |
| 5 | Global warming evidence all around us|A messag... | -0.2960 | 0.195 | 0.664 | 0.142 | Negative | global warming evidence all around us a messag... | [global, warming, evidence, all, around, us, a... | [global, warming, evidence, around, us, messag... |
| 6 | Migratory Birds' New Climate Change Strategy: ... | 0.0000 | 0.000 | 1.000 | 0.000 | Neutral | migratory birds new climate change strategy ... | [migratory, birds, new, climate, change, strat... | [migratory, birds, new, climate, change, strat... |
| 7 | Southern Africa: Competing for Limpopo Water: ... | 0.0000 | 0.000 | 1.000 | 0.000 | Neutral | southern africa competing for limpopo water ... | [southern, africa, competing, for, limpopo, wa... | [southern, africa, competing, limpopo, water, ... |
| 8 | Global warming to impact wheat, rice productio... | 0.0772 | 0.065 | 0.854 | 0.080 | Positive | global warming to impact wheat rice productio... | [global, warming, to, impact, wheat, rice, pro... | [global, warming, impact, wheat, rice, product... |
| 9 | How do we solve this global warming thing? [link] | 0.3400 | 0.000 | 0.673 | 0.327 | Positive | how do we solve this global warming thing | [how, do, we, solve, this, global, warming, th... | [solve, global, warming, thing] |
from nltk.stem.porter import *
lemmatizer = WordNetLemmatizer()
df_climate['Cleansed_Tweets'] = df_climate['Cleansed_Tweets'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
df_climate.head(n=10)
| tweet | compound | negative | neutral | positive | Predicted Sentiment | Tweets_with_sw | word_list_with_sw | Cleansed_Tweets | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Global warming report urges governments to act... | 0.1779 | 0.102 | 0.711 | 0.188 | Positive | global warming report urges governments to act... | [global, warming, report, urges, governments, ... | [global, warming, report, urge, government, ac... |
| 1 | Fighting poverty and global warming in Africa ... | -0.6369 | 0.468 | 0.403 | 0.129 | Negative | fighting poverty and global warming in africa | [fighting, poverty, and, global, warming, in, ... | [fighting, poverty, global, warming, africa] |
| 2 | Carbon offsets: How a Vatican forest failed to... | -0.4019 | 0.237 | 0.647 | 0.115 | Negative | carbon offsets how a vatican forest failed to... | [carbon, offsets, how, a, vatican, forest, fai... | [carbon, offset, vatican, forest, failed, redu... |
| 3 | URUGUAY: Tools Needed for Those Most Vulnerabl... | -0.2944 | 0.180 | 0.820 | 0.000 | Negative | uruguay tools needed for those most vulnerabl... | [uruguay, tools, needed, for, those, most, vul... | [uruguay, tool, needed, vulnerable, climate, c... |
| 4 | RT @sejorg: RT @JaymiHeimbuch: Ocean Saltiness... | 0.1531 | 0.000 | 0.897 | 0.103 | Positive | rt sejorg rt jaymiheimbuch ocean saltiness... | [rt, sejorg, rt, jaymiheimbuch, ocean, saltine... | [rt, rt, ocean, saltiness, show, global, warmi... |
| 5 | Global warming evidence all around us|A messag... | -0.2960 | 0.195 | 0.664 | 0.142 | Negative | global warming evidence all around us a messag... | [global, warming, evidence, all, around, us, a... | [global, warming, evidence, around, u, message... |
| 6 | Migratory Birds' New Climate Change Strategy: ... | 0.0000 | 0.000 | 1.000 | 0.000 | Neutral | migratory birds new climate change strategy ... | [migratory, birds, new, climate, change, strat... | [migratory, bird, new, climate, change, strate... |
| 7 | Southern Africa: Competing for Limpopo Water: ... | 0.0000 | 0.000 | 1.000 | 0.000 | Neutral | southern africa competing for limpopo water ... | [southern, africa, competing, for, limpopo, wa... | [southern, africa, competing, limpopo, water, ... |
| 8 | Global warming to impact wheat, rice productio... | 0.0772 | 0.065 | 0.854 | 0.080 | Positive | global warming to impact wheat rice productio... | [global, warming, to, impact, wheat, rice, pro... | [global, warming, impact, wheat, rice, product... |
| 9 | How do we solve this global warming thing? [link] | 0.3400 | 0.000 | 0.673 | 0.327 | Positive | how do we solve this global warming thing | [how, do, we, solve, this, global, warming, th... | [solve, global, warming, thing] |
#DeTokenization the tokens back together
df_climate['Cleansed_Tweets'] = df_climate['Cleansed_Tweets'].apply(lambda x: ' '.join([w for w in x]))
df_climate.head(n=10)
| tweet | compound | negative | neutral | positive | Predicted Sentiment | Tweets_with_sw | word_list_with_sw | Cleansed_Tweets | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Global warming report urges governments to act... | 0.1779 | 0.102 | 0.711 | 0.188 | Positive | global warming report urges governments to act... | [global, warming, report, urges, governments, ... | global warming report urge government act brus... |
| 1 | Fighting poverty and global warming in Africa ... | -0.6369 | 0.468 | 0.403 | 0.129 | Negative | fighting poverty and global warming in africa | [fighting, poverty, and, global, warming, in, ... | fighting poverty global warming africa |
| 2 | Carbon offsets: How a Vatican forest failed to... | -0.4019 | 0.237 | 0.647 | 0.115 | Negative | carbon offsets how a vatican forest failed to... | [carbon, offsets, how, a, vatican, forest, fai... | carbon offset vatican forest failed reduce glo... |
| 3 | URUGUAY: Tools Needed for Those Most Vulnerabl... | -0.2944 | 0.180 | 0.820 | 0.000 | Negative | uruguay tools needed for those most vulnerabl... | [uruguay, tools, needed, for, those, most, vul... | uruguay tool needed vulnerable climate change |
| 4 | RT @sejorg: RT @JaymiHeimbuch: Ocean Saltiness... | 0.1531 | 0.000 | 0.897 | 0.103 | Positive | rt sejorg rt jaymiheimbuch ocean saltiness... | [rt, sejorg, rt, jaymiheimbuch, ocean, saltine... | rt rt ocean saltiness show global warming inte... |
| 5 | Global warming evidence all around us|A messag... | -0.2960 | 0.195 | 0.664 | 0.142 | Negative | global warming evidence all around us a messag... | [global, warming, evidence, all, around, us, a... | global warming evidence around u message globa... |
| 6 | Migratory Birds' New Climate Change Strategy: ... | 0.0000 | 0.000 | 1.000 | 0.000 | Neutral | migratory birds new climate change strategy ... | [migratory, birds, new, climate, change, strat... | migratory bird new climate change strategy sta... |
| 7 | Southern Africa: Competing for Limpopo Water: ... | 0.0000 | 0.000 | 1.000 | 0.000 | Neutral | southern africa competing for limpopo water ... | [southern, africa, competing, for, limpopo, wa... | southern africa competing limpopo water climat... |
| 8 | Global warming to impact wheat, rice productio... | 0.0772 | 0.065 | 0.854 | 0.080 | Positive | global warming to impact wheat rice productio... | [global, warming, to, impact, wheat, rice, pro... | global warming impact wheat rice production in... |
| 9 | How do we solve this global warming thing? [link] | 0.3400 | 0.000 | 0.673 | 0.327 | Positive | how do we solve this global warming thing | [how, do, we, solve, this, global, warming, th... | solve global warming thing |
df_climate['Cleansed_Tweets'] = df_climate['Cleansed_Tweets'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
df_climate.head(n=10)
| tweet | compound | negative | neutral | positive | Predicted Sentiment | Tweets_with_sw | word_list_with_sw | Cleansed_Tweets | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Global warming report urges governments to act... | 0.1779 | 0.102 | 0.711 | 0.188 | Positive | global warming report urges governments to act... | [global, warming, report, urges, governments, ... | global warming report urge government brussels... |
| 1 | Fighting poverty and global warming in Africa ... | -0.6369 | 0.468 | 0.403 | 0.129 | Negative | fighting poverty and global warming in africa | [fighting, poverty, and, global, warming, in, ... | fighting poverty global warming africa |
| 2 | Carbon offsets: How a Vatican forest failed to... | -0.4019 | 0.237 | 0.647 | 0.115 | Negative | carbon offsets how a vatican forest failed to... | [carbon, offsets, how, a, vatican, forest, fai... | carbon offset vatican forest failed reduce glo... |
| 3 | URUGUAY: Tools Needed for Those Most Vulnerabl... | -0.2944 | 0.180 | 0.820 | 0.000 | Negative | uruguay tools needed for those most vulnerabl... | [uruguay, tools, needed, for, those, most, vul... | uruguay tool needed vulnerable climate change |
| 4 | RT @sejorg: RT @JaymiHeimbuch: Ocean Saltiness... | 0.1531 | 0.000 | 0.897 | 0.103 | Positive | rt sejorg rt jaymiheimbuch ocean saltiness... | [rt, sejorg, rt, jaymiheimbuch, ocean, saltine... | ocean saltiness show global warming intensifyi... |
| 5 | Global warming evidence all around us|A messag... | -0.2960 | 0.195 | 0.664 | 0.142 | Negative | global warming evidence all around us a messag... | [global, warming, evidence, all, around, us, a... | global warming evidence around message global ... |
| 6 | Migratory Birds' New Climate Change Strategy: ... | 0.0000 | 0.000 | 1.000 | 0.000 | Neutral | migratory birds new climate change strategy ... | [migratory, birds, new, climate, change, strat... | migratory bird climate change strategy stay home |
| 7 | Southern Africa: Competing for Limpopo Water: ... | 0.0000 | 0.000 | 1.000 | 0.000 | Neutral | southern africa competing for limpopo water ... | [southern, africa, competing, for, limpopo, wa... | southern africa competing limpopo water climat... |
| 8 | Global warming to impact wheat, rice productio... | 0.0772 | 0.065 | 0.854 | 0.080 | Positive | global warming to impact wheat rice productio... | [global, warming, to, impact, wheat, rice, pro... | global warming impact wheat rice production in... |
| 9 | How do we solve this global warming thing? [link] | 0.3400 | 0.000 | 0.673 | 0.327 | Positive | how do we solve this global warming thing | [how, do, we, solve, this, global, warming, th... | solve global warming thing |
df_climate['Cleansed_Tweets_word_list'] = df_climate['Cleansed_Tweets'].apply(lambda x:word_tokenize(str(x)))
clean_text = Counter([item for sublist in df_climate['Cleansed_Tweets_word_list'] for item in sublist])
Cleansed_Text = pd.DataFrame(clean_text.most_common(20))
Cleansed_Text.columns = ['Cleansed Text','Frequency']
Cleansed_Text.style.background_gradient(cmap = 'Blues')
x = Cleansed_Text['Cleansed Text']
y = Cleansed_Text['Frequency']
plt.figure(figsize = (10,8))
#indexes = np.arange(len(y))
plt.title("Cleansed Text")
plt.xlabel("Frequency")
plt.ylabel("Cleansed Text")
plt.barh( x,y, color = 'slateblue')
plt.gca().invert_yaxis()
#addlabels(x, y)
for i, v in enumerate(y):
plt.text(v + 20, i, str(v), ha='center')
plt.show()
clean_text = Counter([item for sublist in df_climate['Cleansed_Tweets_word_list'] for item in sublist])
Cleansed_Text = pd.DataFrame(clean_text.most_common(500))
Cleansed_Text.head(5)
| 0 | 1 | |
|---|---|---|
| 0 | climate | 3192 |
| 1 | change | 3114 |
| 2 | global | 2879 |
| 3 | warming | 2819 |
| 4 | snow | 241 |
Cleansed_Text.tail(5)
| 0 | 1 | |
|---|---|---|
| 495 | meeting | 15 |
| 496 | either | 15 |
| 497 | poisoning | 15 |
| 498 | economic | 15 |
| 499 | flood | 15 |
def multiple_word_remove_func(text, words_2_remove_list):
words_to_remove_list = words_2_remove_list
words = word_tokenize(text)
text = ' '.join([word for word in words if word not in words_to_remove_list])
return text
list_with_words = ["aaaaaaa","report","climate","change","global","warming","april","gore","bolivia","california","india","obama","federal","senate","snow","aaaah","aaand","five","esty","forty","nine","aahhhhh","blog","virginia","tcot","ippc","via","bill"]
df_climate['Cleansed_Tweets'] = df_climate['Cleansed_Tweets'].apply(lambda x:multiple_word_remove_func(x, list_with_words))
df_climate['Processed_Tweets'] = df_climate['Cleansed_Tweets'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
df_climate.head(n=10)
| tweet | compound | negative | neutral | positive | Predicted Sentiment | Tweets_with_sw | word_list_with_sw | Cleansed_Tweets | Cleansed_Tweets_word_list | Processed_Tweets | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Global warming report urges governments to act... | 0.1779 | 0.102 | 0.711 | 0.188 | Positive | global warming report urges governments to act... | [global, warming, report, urges, governments, ... | urge government brussels belgium world face in... | [global, warming, report, urge, government, br... | urge government brussels belgium world face in... |
| 1 | Fighting poverty and global warming in Africa ... | -0.6369 | 0.468 | 0.403 | 0.129 | Negative | fighting poverty and global warming in africa | [fighting, poverty, and, global, warming, in, ... | fighting poverty africa | [fighting, poverty, global, warming, africa] | fighting poverty africa |
| 2 | Carbon offsets: How a Vatican forest failed to... | -0.4019 | 0.237 | 0.647 | 0.115 | Negative | carbon offsets how a vatican forest failed to... | [carbon, offsets, how, a, vatican, forest, fai... | carbon offset vatican forest failed reduce | [carbon, offset, vatican, forest, failed, redu... | carbon offset vatican forest failed reduce |
| 3 | URUGUAY: Tools Needed for Those Most Vulnerabl... | -0.2944 | 0.180 | 0.820 | 0.000 | Negative | uruguay tools needed for those most vulnerabl... | [uruguay, tools, needed, for, those, most, vul... | uruguay tool needed vulnerable | [uruguay, tool, needed, vulnerable, climate, c... | uruguay tool needed vulnerable |
| 4 | RT @sejorg: RT @JaymiHeimbuch: Ocean Saltiness... | 0.1531 | 0.000 | 0.897 | 0.103 | Positive | rt sejorg rt jaymiheimbuch ocean saltiness... | [rt, sejorg, rt, jaymiheimbuch, ocean, saltine... | ocean saltiness show intensifying water cycle | [ocean, saltiness, show, global, warming, inte... | ocean saltiness show intensifying water cycle |
| 5 | Global warming evidence all around us|A messag... | -0.2960 | 0.195 | 0.664 | 0.142 | Negative | global warming evidence all around us a messag... | [global, warming, evidence, all, around, us, a... | evidence around message denier doubter look ar... | [global, warming, evidence, around, message, g... | evidence around message denier doubter look ar... |
| 6 | Migratory Birds' New Climate Change Strategy: ... | 0.0000 | 0.000 | 1.000 | 0.000 | Neutral | migratory birds new climate change strategy ... | [migratory, birds, new, climate, change, strat... | migratory bird strategy stay home | [migratory, bird, climate, change, strategy, s... | migratory bird strategy stay home |
| 7 | Southern Africa: Competing for Limpopo Water: ... | 0.0000 | 0.000 | 1.000 | 0.000 | Neutral | southern africa competing for limpopo water ... | [southern, africa, competing, for, limpopo, wa... | southern africa competing limpopo water bring ... | [southern, africa, competing, limpopo, water, ... | southern africa competing limpopo water bring ... |
| 8 | Global warming to impact wheat, rice productio... | 0.0772 | 0.065 | 0.854 | 0.080 | Positive | global warming to impact wheat rice productio... | [global, warming, to, impact, wheat, rice, pro... | impact wheat rice production ludhiana scarcity... | [global, warming, impact, wheat, rice, product... | impact wheat rice production ludhiana scarcity... |
| 9 | How do we solve this global warming thing? [link] | 0.3400 | 0.000 | 0.673 | 0.327 | Positive | how do we solve this global warming thing | [how, do, we, solve, this, global, warming, th... | solve thing | [solve, global, warming, thing] | solve thing |
df_climate = df_climate.drop(df_climate[df_climate['Processed_Tweets']==''].index).reset_index(drop=True)
df_climate['Final_Tweets_word_list'] = df_climate['Processed_Tweets'].apply(lambda x:word_tokenize(str(x)))
df_climate.head(5)
| tweet | compound | negative | neutral | positive | Predicted Sentiment | Tweets_with_sw | word_list_with_sw | Cleansed_Tweets | Cleansed_Tweets_word_list | Processed_Tweets | Final_Tweets_word_list | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Global warming report urges governments to act... | 0.1779 | 0.102 | 0.711 | 0.188 | Positive | global warming report urges governments to act... | [global, warming, report, urges, governments, ... | urge government brussels belgium world face in... | [global, warming, report, urge, government, br... | urge government brussels belgium world face in... | [urge, government, brussels, belgium, world, f... |
| 1 | Fighting poverty and global warming in Africa ... | -0.6369 | 0.468 | 0.403 | 0.129 | Negative | fighting poverty and global warming in africa | [fighting, poverty, and, global, warming, in, ... | fighting poverty africa | [fighting, poverty, global, warming, africa] | fighting poverty africa | [fighting, poverty, africa] |
| 2 | Carbon offsets: How a Vatican forest failed to... | -0.4019 | 0.237 | 0.647 | 0.115 | Negative | carbon offsets how a vatican forest failed to... | [carbon, offsets, how, a, vatican, forest, fai... | carbon offset vatican forest failed reduce | [carbon, offset, vatican, forest, failed, redu... | carbon offset vatican forest failed reduce | [carbon, offset, vatican, forest, failed, reduce] |
| 3 | URUGUAY: Tools Needed for Those Most Vulnerabl... | -0.2944 | 0.180 | 0.820 | 0.000 | Negative | uruguay tools needed for those most vulnerabl... | [uruguay, tools, needed, for, those, most, vul... | uruguay tool needed vulnerable | [uruguay, tool, needed, vulnerable, climate, c... | uruguay tool needed vulnerable | [uruguay, tool, needed, vulnerable] |
| 4 | RT @sejorg: RT @JaymiHeimbuch: Ocean Saltiness... | 0.1531 | 0.000 | 0.897 | 0.103 | Positive | rt sejorg rt jaymiheimbuch ocean saltiness... | [rt, sejorg, rt, jaymiheimbuch, ocean, saltine... | ocean saltiness show intensifying water cycle | [ocean, saltiness, show, global, warming, inte... | ocean saltiness show intensifying water cycle | [ocean, saltiness, show, intensifying, water, ... |
final_text = Counter([item for sublist in df_climate['Final_Tweets_word_list'] for item in sublist])
Processed_Text = pd.DataFrame(final_text.most_common(20))
Processed_Text.columns = ['Processed Text','Frequency']
Processed_Text.style.background_gradient(cmap = 'Blues')
x = Processed_Text['Processed Text']
y = Processed_Text['Frequency']
plt.figure(figsize = (10,8))
#indexes = np.arange(len(y))
plt.title("Processed Text")
plt.xlabel("Frequency")
plt.ylabel("Processed Text")
plt.barh( x,y, color = 'slateblue')
plt.gca().invert_yaxis()
#addlabels(x, y)
for i, v in enumerate(y):
plt.text(v + 20, i, str(v), ha='center')
plt.show()
final_text = Counter([item for sublist in df_climate['Final_Tweets_word_list'] for item in sublist])
Processed_Text = pd.DataFrame(final_text.most_common(1000))
Processed_Text.head(5)
| 0 | 1 | |
|---|---|---|
| 0 | news | 222 |
| 1 | science | 186 |
| 2 | people | 180 |
| 3 | scientist | 167 |
| 4 | energy | 164 |
Processed_Text.tail(5)
| 0 | 1 | |
|---|---|---|
| 995 | build | 8 |
| 996 | bogus | 8 |
| 997 | fading | 8 |
| 998 | technique | 8 |
| 999 | covered | 8 |
from wordcloud import WordCloud
word_list = Counter([item for sublist in df_climate['word_list_with_sw'] for item in sublist])
stop_words = pd.DataFrame(word_list.most_common(400))
stop_words.columns = ['Common_words','Frequency']
stop_words.style.background_gradient(cmap = 'Blues')
wordcloud_text = ','.join(list(stop_words['Common_words'].values))
wordcloud = WordCloud(background_color="white",max_words=5000, contour_width=3).generate(str(wordcloud_text))
plt.imshow(wordcloud, interpolation="bilinear")
plt.title("Word cloud for Raw Tweets")
plt.axis("off")
plt.show()
final_word = Counter([item for sublist in df_climate['Final_Tweets_word_list'] for item in sublist])
cleansed_final_tweet = pd.DataFrame(final_word.most_common(400))
cleansed_final_tweet.columns = ['Cleansed words','Frequency']
cleansed_final_tweet.style.background_gradient(cmap = 'Blues')
wordcloud_text = ','.join(list(cleansed_final_tweet['Cleansed words'].values))
wordcloud = WordCloud(background_color="white",max_words=5000, contour_width=3).generate(str(wordcloud_text))
plt.imshow(wordcloud, interpolation="bilinear")
plt.title("Word cloud for Cleansed Tweets")
plt.axis("off")
plt.show()
df1_positive = df_climate[['Predicted Sentiment','Final_Tweets_word_list']]
df1_positive = df1_positive[df1_positive['Predicted Sentiment']=='Positive']
top_positive = Counter([item for sublist in df1_positive['Final_Tweets_word_list'] for item in sublist])
positive_words = pd.DataFrame(top_positive.most_common(400))
positive_words.columns = ['Positive Words','Frequency']
positive_words.style.background_gradient(cmap = 'Blues')
wordcloud_text = ','.join(list(positive_words['Positive Words'].values))
wordcloud = WordCloud(background_color="white",max_words=50, contour_width=3).generate(str(wordcloud_text))
plt.imshow(wordcloud, interpolation="bilinear")
plt.title("Word cloud for Cleansed text Positive")
plt.axis("off")
plt.show()
rejected_text=("aaaaaaa climate change global warming science study scientist report april gore bolivia could california india obama federal senate snow aaaah aaand five esty forty nine aahhhhh blog virginia tcot ippc via bill news")
wordcloud = WordCloud(background_color="white",max_words=5000, contour_width=3).generate(rejected_text)
plt.title("Word cloud for Rejected words")
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.margins(x=0, y=0)
plt.show()
df1_positive = df_climate[['Predicted Sentiment','Final_Tweets_word_list']]
df1_positive = df1_positive[df1_positive['Predicted Sentiment']=='Positive']
top_positive = Counter([item for sublist in df1_positive['Final_Tweets_word_list'] for item in sublist])
words = ' '.join(top_positive)
count_words = len(words)
positive_words = pd.DataFrame(top_positive.most_common(20))
positive_words.columns = ['Positive Words','Frequency']
positive_words.style.background_gradient(cmap = 'Blues')
x = positive_words['Positive Words']
y = positive_words['Frequency']
plt.figure(figsize = (10,8))
#indexes = np.arange(len(y))
print("No. of vocabulary in positive tweets is:", count_words)
plt.title( " Common words in Positve Tweets ")
plt.xlabel("Frequency")
plt.ylabel("Positive Words")
plt.barh( x,y, color = 'slateblue')
plt.gca().invert_yaxis()
#addlabels(x, y)
for i, v in enumerate(y):
plt.text(v + 20, i, str(v), ha='center')
plt.show()
No. of vocabulary in positive tweets is: 30565
df1_positive = df_climate[['Predicted Sentiment','Final_Tweets_word_list']]
df1_positive = df1_positive[df1_positive['Predicted Sentiment']=='Positive']
top_positive = Counter([item for sublist in df1_positive['Final_Tweets_word_list'] for item in sublist])
positive_words = pd.DataFrame(top_positive.most_common(30000))
positive_words.columns = ['Positive Words','Frequency']
#positive_words.to_csv("positive_words.csv")
positive_words
| Positive Words | Frequency | |
|---|---|---|
| 0 | energy | 139 |
| 1 | news | 97 |
| 2 | science | 87 |
| 3 | scientist | 84 |
| 4 | could | 81 |
| ... | ... | ... |
| 3861 | august | 1 |
| 3862 | chalk | 1 |
| 3863 | abandoned | 1 |
| 3864 | scientifically | 1 |
| 3865 | challenged | 1 |
3866 rows × 2 columns
df1_positive_s = df1_positive[df1_positive['Predicted Sentiment']=='Positive']
df1_positive_s.reset_index(drop=True)
| Predicted Sentiment | Final_Tweets_word_list | |
|---|---|---|
| 0 | Positive | [urge, government, brussels, belgium, world, f... |
| 1 | Positive | [ocean, saltiness, show, intensifying, water, ... |
| 2 | Positive | [impact, wheat, rice, production, ludhiana, sc... |
| 3 | Positive | [solve, thing] |
| 4 | Positive | [preliminary, analysis, suggests, natural, cou... |
| ... | ... | ... |
| 2494 | Positive | [climbing, august, weather, first, unbelievabl... |
| 2495 | Positive | [phrase, abandoned, favor, luntz] |
| 2496 | Positive | [investigate, scientist, mann] |
| 2497 | Positive | [tube, parody, enjoy] |
| 2498 | Positive | [made, hair, brained, theory, scientifically, ... |
2499 rows × 2 columns
df1_negative = df_climate[['Predicted Sentiment','Final_Tweets_word_list']]
df1_negative_s = df1_negative[df1_negative['Predicted Sentiment']=='Negative']
negative = Counter([item for sublist in df1_negative_s['Final_Tweets_word_list'] for item in sublist])
negative_words = pd.DataFrame(negative.most_common(400))
negative_words.columns = ['Negative words','Frequency']
negative_words.style.background_gradient(cmap = 'Blues')
wordcloud_text = ','.join(list(negative_words['Negative words'].values))
wordcloud = WordCloud(background_color="white",max_words=5000, contour_width=3).generate(str(wordcloud_text))
plt.imshow(wordcloud, interpolation="bilinear")
plt.title("Word cloud for Cleansed text Negative")
plt.axis("off")
plt.show()
df1_negative_s = df1_negative[df1_negative['Predicted Sentiment']=='Negative']
df1_negative_s.reset_index(drop=True)
| Predicted Sentiment | Final_Tweets_word_list | |
|---|---|---|
| 0 | Negative | [fighting, poverty, africa] |
| 1 | Negative | [carbon, offset, vatican, forest, failed, reduce] |
| 2 | Negative | [uruguay, tool, needed, vulnerable] |
| 3 | Negative | [evidence, around, message, denier, doubter, l... |
| 4 | Negative | [blamed, coastal, whale, migration, dwindles, ... |
| ... | ... | ... |
| 1602 | Negative | [baby, lowdown, next, series, question] |
| 1603 | Negative | [bird, lizard, fight] |
| 1604 | Negative | [hoax, find, answer, question, think, hoax, wo... |
| 1605 | Negative | [ecocide, used, prosecute, denier] |
| 1606 | Negative | [eyed, golfer, dare, tell, twenty, golfer, mak... |
1607 rows × 2 columns
df1_negative_s = df1_negative_s['Final_Tweets_word_list'].reset_index()
top_negative = Counter([item for sublist in df1_negative_s['Final_Tweets_word_list'] for item in sublist])
words = ' '.join(top_negative)
count_words_neg = len(words)
negative_text = pd.DataFrame(top_negative.most_common(20))
negative_text.columns = ['Negative words','Frequency']
negative_text.style.background_gradient(cmap = 'Blues')
def addlabels(x,y):
for i in range(len(x)):
plt.text(i, y[i], y[i], ha = 'center')
x = negative_text['Negative words']
y = negative_text['Frequency']
plt.figure(figsize = (14,8))
print("No. of vocabulary in negative tweets is:", count_words_neg)
plt.title( " Common words for Negative Sentiments ")
plt.xlabel("Negative words")
plt.ylabel("Frequency")
plt.barh( x,y, color = 'slateblue')
plt.gca().invert_yaxis()
#addlabels(x, y)
for i, v in enumerate(y):
plt.text(v+2, i, str(v), ha='center')
plt.show()
plt.figure(figsize=(5, 3))
No. of vocabulary in negative tweets is: 22546
<Figure size 500x300 with 0 Axes>
<Figure size 500x300 with 0 Axes>
top_negative = Counter([item for sublist in df1_negative_s['Final_Tweets_word_list'] for item in sublist])
negative_text = pd.DataFrame(top_negative.most_common(20000))
negative_text.columns = ['Negative words','Frequency']
#negative_text.to_csv("Negative_words.csv")
negative_text
| Negative words | Frequency | |
|---|---|---|
| 0 | fight | 73 |
| 1 | worse | 68 |
| 2 | news | 61 |
| 3 | scientist | 61 |
| 4 | make | 60 |
| ... | ... | ... |
| 2862 | surplus | 1 |
| 2863 | pretending | 1 |
| 2864 | ecocide | 1 |
| 2865 | eyed | 1 |
| 2866 | dare | 1 |
2867 rows × 2 columns
df1_neutral = df_climate[['Predicted Sentiment','Final_Tweets_word_list']]
df1_neutral_s = df1_neutral[df1_neutral['Predicted Sentiment']=='Neutral']
top_neutral = Counter([item for sublist in df1_neutral_s['Final_Tweets_word_list'] for item in sublist])
temp_df_after_cleansing = pd.DataFrame(top_neutral.most_common(400))
temp_df_after_cleansing.columns = ['Neutral words','Frequency']
temp_df_after_cleansing.style.background_gradient(cmap = 'Blues')
wordcloud_text = ','.join(list(temp_df_after_cleansing['Neutral words'].values))
wordcloud = WordCloud(background_color="white",max_words=5000, contour_width=3).generate(str(wordcloud_text))
plt.imshow(wordcloud, interpolation="bilinear")
plt.title("Word cloud for Cleansed text Neutral")
plt.axis("off")
plt.show()
df1_neutral_s = df1_neutral[df1_neutral['Predicted Sentiment']=='Neutral']
df1_neutral_s.reset_index(drop=True)
| Predicted Sentiment | Final_Tweets_word_list | |
|---|---|---|
| 0 | Neutral | [migratory, bird, strategy, stay, home] |
| 1 | Neutral | [southern, africa, competing, limpopo, water, ... |
| 2 | Neutral | [ecotone, perspective] |
| 3 | Neutral | [researcher, track] |
| 4 | Neutral | [leader, national, indigenous, woman, life, gd... |
| ... | ... | ... |
| 1402 | Neutral | [daniel, plan, huffington, post] |
| 1403 | Neutral | [senator, prepare, compromise, reuters, yahoo,... |
| 1404 | Neutral | [thirty, million, year, palm, tree, alaska, va... |
| 1405 | Neutral | [fossil, fuel, dilemma, america, million, barr... |
| 1406 | Neutral | [mystery, shrinking, sheep] |
1407 rows × 2 columns
top_neutral = Counter([item for sublist in df1_neutral_s['Final_Tweets_word_list'] for item in sublist])
words = ' '.join(top_neutral)
count_words_neutral = len(words)
neutral_text = pd.DataFrame(top_neutral.most_common(20))
neutral_text.columns = ['Neutral words','Frequency']
neutral_text.style.background_gradient(cmap = 'Blues')
def addlabels(x,y):
for i in range(len(x)):
plt.text(i, y[i], y[i], ha = 'center')
x = neutral_text['Neutral words']
y = neutral_text['Frequency']
plt.figure(figsize = (14,8))
print("No. of vocabulary in neutral tweets is:", count_words_neutral)
plt.title( " Common words for Neutral Sentiments ")
plt.xlabel("Neutral words")
plt.ylabel("Frequency")
plt.barh( x,y, color = 'slateblue')
plt.gca().invert_yaxis()
#addlabels(x, y)
for i, v in enumerate(y):
plt.text(v+2, i, str(v), ha='center')
plt.show()
#plt.figure(figsize=(5, 3))
No. of vocabulary in neutral tweets is: 18625
top_neutral = Counter([item for sublist in df1_neutral_s['Final_Tweets_word_list'] for item in sublist])
neutral_text = pd.DataFrame(top_neutral.most_common(20000))
neutral_text.columns = ['Neutral words','Frequency']
#neutral_text.to_csv("Neutral_text.csv")
neutral_text
| Neutral words | Frequency | |
|---|---|---|
| 0 | people | 81 |
| 1 | world | 65 |
| 2 | news | 64 |
| 3 | conference | 57 |
| 4 | earth | 55 |
| ... | ... | ... |
| 2335 | vanished | 1 |
| 2336 | fossil | 1 |
| 2337 | dilemma | 1 |
| 2338 | barrel | 1 |
| 2339 | serve | 1 |
2340 rows × 2 columns
from nltk.probability import FreqDist
pos_freqdist = FreqDist(top_positive)
pos_freqdist.tabulate(30)
energy news science scientist could great help people world green earth volcano like time debate good think effect action weather make state study year blizzard would winter tell conference another
139 97 87 84 81 81 71 70 68 68 65 64 61 61 54 53 53 52 48 47 47 47 46 43 43 42 41 39 39 39
neg_freqdist = FreqDist(top_negative)
neg_freqdist.tabulate(30)
fight worse news scientist make science mean time stop blame cold think denier allergy washington green weather legislation hoax clinical trial winter storm blizzard world collagen people screaming government earth
73 68 61 61 60 57 54 54 52 50 48 44 43 43 37 36 35 34 33 32 32 31 31 31 30 30 29 29 28 28
neutral_freqdist = FreqDist(top_neutral)
neutral_freqdist.tabulate(30)
people world news conference earth agency impact immigration graham science right talk issue blizzard volcano time carbon senator reuters take green study panel prepare mother make expert summit indigenous scientist
81 65 64 57 55 50 47 45 44 42 40 37 33 33 32 32 30 30 28 27 27 27 27 25 25 24 24 23 22 22
pos_freqdist.plot(30)
<Axes: xlabel='Samples', ylabel='Counts'>
neg_freqdist.plot(30)
<Axes: xlabel='Samples', ylabel='Counts'>
neutral_freqdist.plot(30)
<Axes: xlabel='Samples', ylabel='Counts'>
# Import label encoder
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
df_climate['Sentiment_Encoded']= label_encoder.fit_transform(df_climate['Predicted Sentiment'])
df_climate['Sentiment_Encoded'].unique()
array([2, 0, 1])
mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
mapping
{'Negative': 0, 'Neutral': 1, 'Positive': 2}
dataset = df_climate[['tweet','Processed_Tweets','Final_Tweets_word_list','Predicted Sentiment','Sentiment_Encoded']]
dataset.columns = ['tweet','Processed_Tweets','Final_Tweets_word_list','Predicted Sentiment','Sentiment_Encoded']
final_df_global_warm = df_climate[['tweet','Processed_Tweets','Final_Tweets_word_list','Predicted Sentiment','Sentiment_Encoded']]
final_df_global_warm.columns = ['tweet','Processed_Tweets','Final_Tweets_word_list', 'Predicted Sentiment','Sentiment_Encoded']
final_df_global_warm.head(5)
| tweet | Processed_Tweets | Final_Tweets_word_list | Predicted Sentiment | Sentiment_Encoded | |
|---|---|---|---|---|---|
| 0 | Global warming report urges governments to act... | urge government brussels belgium world face in... | [urge, government, brussels, belgium, world, f... | Positive | 2 |
| 1 | Fighting poverty and global warming in Africa ... | fighting poverty africa | [fighting, poverty, africa] | Negative | 0 |
| 2 | Carbon offsets: How a Vatican forest failed to... | carbon offset vatican forest failed reduce | [carbon, offset, vatican, forest, failed, reduce] | Negative | 0 |
| 3 | URUGUAY: Tools Needed for Those Most Vulnerabl... | uruguay tool needed vulnerable | [uruguay, tool, needed, vulnerable] | Negative | 0 |
| 4 | RT @sejorg: RT @JaymiHeimbuch: Ocean Saltiness... | ocean saltiness show intensifying water cycle | [ocean, saltiness, show, intensifying, water, ... | Positive | 2 |
final_df_global_warm.to_csv("Task3_Processed_Sentiments_global_warming.csv",index=False)
from sklearn.feature_extraction.text import CountVectorizer
# create a count vertorizer object and set the size of the vocabulary to 14000
cv = CountVectorizer(max_features = 8000, ngram_range = (1,2))
# convert the dtype of final tweet column to unicode string and convert them to bag of words
X = cv.fit_transform(final_df_global_warm['Processed_Tweets'].values.astype('U')).toarray()
y = final_df_global_warm['Sentiment_Encoded']
print("X: ", X[0:5])
X: [[0 0 0 ... 0 0 0] [0 0 0 ... 0 0 0] [0 0 0 ... 0 0 0] [0 0 0 ... 0 0 0] [0 0 0 ... 0 0 0]]
print("y: ",y[0:5])
y: 0 2 1 0 2 0 3 0 4 2 Name: Sentiment_Encoded, dtype: int64
X.shape
(5513, 8000)
y.shape
(5513,)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
print("number of tweets in training dataset: ", len(X_train))
print("number of tweets in testing dataset: ", len(X_test))
number of tweets in training dataset: 3859 number of tweets in testing dataset: 1654
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
def train_classifier_get_confusion_metric(model, X_train, y_train, X_test, y_test):
# train the model on training data
model.fit(X_train, y_train)
# evaluate the model
y_pred = model.predict(X_test)
# calculate the performance metrics
confusion = confusion_matrix(y_test, y_pred)
return confusion
start_lr = time.time()
model1_lr = LogisticRegression(max_iter = 1000, multi_class = "multinomial")
model1_lr.fit(X_train,y_train)
LogisticRegression(max_iter=1000, multi_class='multinomial')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression(max_iter=1000, multi_class='multinomial')
y_pred_train_lr = model1_lr.predict(X_train)
y_pred_test_lr = model1_lr.predict(X_test)
report1_lr = classification_report(y_test, y_pred_test_lr)
print(f"Classification Report: \n{report1_lr}")
Classification Report:
precision recall f1-score support
0 0.83 0.71 0.77 496
1 0.76 0.71 0.73 411
2 0.76 0.85 0.80 747
accuracy 0.78 1654
macro avg 0.78 0.76 0.77 1654
weighted avg 0.78 0.78 0.77 1654
acc_train_lr = accuracy_score(y_train, y_pred_train_lr)
acc_test_lr = accuracy_score(y_test, y_pred_test_lr)
prec_lr = precision_score(y_test, y_pred_test_lr, average='weighted')
recall_lr = recall_score(y_test, y_pred_test_lr, average='weighted')
f1_score_lr = f1_score(y_test, y_pred_test_lr, average='weighted')
print('Logistic Regression Accuracy Train percentage is:', round(acc_train_lr*100,2))
print('Logistic Regression Accuracy percentage is:', round(acc_test_lr*100,2))
print('Logistic Regression Precision percentage is:', round(prec_lr*100,2))
print('Logistic Regression Recall percentage is:', round(recall_lr*100,2))
print('Logistic Regression F1-score percentage is:', round(f1_score_lr*100,2))
Logistic Regression Accuracy Train percentage is: 97.28 Logistic Regression Accuracy percentage is: 77.57 Logistic Regression Precision percentage is: 77.84 Logistic Regression Recall percentage is: 77.57 Logistic Regression F1-score percentage is: 77.44
lr_confusion = train_classifier_get_confusion_metric(model1_lr, X_train, y_train, X_test, y_test)
print("Confusion Matrix: ", lr_confusion)
disp = sns.heatmap(lr_confusion, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
disp.plot()
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for Logistic Regression')
plt.show()
Confusion Matrix: [[354 39 103] [ 19 292 100] [ 56 54 637]]
end_lr = time.time()
Training_time_lr = end_lr-start_lr
print("The time of execution of Logistic Regression Model is :",
Training_time_lr , "ms")
The time of execution of Logistic Regression Model is : 21.522359132766724 ms
start_mnb = time.time()
params = {'alpha': [0.01,0.1,0.5,1,10],}
model2_mnb = GridSearchCV(MultinomialNB(), param_grid=params, cv=5, verbose=10)
model2_mnb
GridSearchCV(cv=5, estimator=MultinomialNB(),
param_grid={'alpha': [0.01, 0.1, 0.5, 1, 10]}, verbose=10)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5, estimator=MultinomialNB(),
param_grid={'alpha': [0.01, 0.1, 0.5, 1, 10]}, verbose=10)MultinomialNB()
MultinomialNB()
model2_mnb.fit(X_train,y_train)
Fitting 5 folds for each of 5 candidates, totalling 25 fits [CV 1/5; 1/5] START alpha=0.01.................................................. [CV 1/5; 1/5] END ...................alpha=0.01;, score=0.710 total time= 0.9s [CV 2/5; 1/5] START alpha=0.01.................................................. [CV 2/5; 1/5] END ...................alpha=0.01;, score=0.720 total time= 0.8s [CV 3/5; 1/5] START alpha=0.01.................................................. [CV 3/5; 1/5] END ...................alpha=0.01;, score=0.750 total time= 0.8s [CV 4/5; 1/5] START alpha=0.01.................................................. [CV 4/5; 1/5] END ...................alpha=0.01;, score=0.703 total time= 0.8s [CV 5/5; 1/5] START alpha=0.01.................................................. [CV 5/5; 1/5] END ...................alpha=0.01;, score=0.702 total time= 0.8s [CV 1/5; 2/5] START alpha=0.1................................................... [CV 1/5; 2/5] END ....................alpha=0.1;, score=0.715 total time= 0.8s [CV 2/5; 2/5] START alpha=0.1................................................... [CV 2/5; 2/5] END ....................alpha=0.1;, score=0.729 total time= 0.8s [CV 3/5; 2/5] START alpha=0.1................................................... [CV 3/5; 2/5] END ....................alpha=0.1;, score=0.751 total time= 0.8s [CV 4/5; 2/5] START alpha=0.1................................................... [CV 4/5; 2/5] END ....................alpha=0.1;, score=0.694 total time= 0.9s [CV 5/5; 2/5] START alpha=0.1................................................... [CV 5/5; 2/5] END ....................alpha=0.1;, score=0.699 total time= 0.8s [CV 1/5; 3/5] START alpha=0.5................................................... [CV 1/5; 3/5] END ....................alpha=0.5;, score=0.706 total time= 0.8s [CV 2/5; 3/5] START alpha=0.5................................................... [CV 2/5; 3/5] END ....................alpha=0.5;, score=0.731 total time= 0.8s [CV 3/5; 3/5] START alpha=0.5................................................... [CV 3/5; 3/5] END ....................alpha=0.5;, score=0.753 total time= 0.9s [CV 4/5; 3/5] START alpha=0.5................................................... [CV 4/5; 3/5] END ....................alpha=0.5;, score=0.697 total time= 0.9s [CV 5/5; 3/5] START alpha=0.5................................................... [CV 5/5; 3/5] END ....................alpha=0.5;, score=0.694 total time= 0.8s [CV 1/5; 4/5] START alpha=1..................................................... [CV 1/5; 4/5] END ......................alpha=1;, score=0.701 total time= 0.8s [CV 2/5; 4/5] START alpha=1..................................................... [CV 2/5; 4/5] END ......................alpha=1;, score=0.737 total time= 0.8s [CV 3/5; 4/5] START alpha=1..................................................... [CV 3/5; 4/5] END ......................alpha=1;, score=0.751 total time= 0.8s [CV 4/5; 4/5] START alpha=1..................................................... [CV 4/5; 4/5] END ......................alpha=1;, score=0.709 total time= 0.8s [CV 5/5; 4/5] START alpha=1..................................................... [CV 5/5; 4/5] END ......................alpha=1;, score=0.693 total time= 1.0s [CV 1/5; 5/5] START alpha=10.................................................... [CV 1/5; 5/5] END .....................alpha=10;, score=0.632 total time= 1.0s [CV 2/5; 5/5] START alpha=10.................................................... [CV 2/5; 5/5] END .....................alpha=10;, score=0.685 total time= 0.8s [CV 3/5; 5/5] START alpha=10.................................................... [CV 3/5; 5/5] END .....................alpha=10;, score=0.724 total time= 0.8s [CV 4/5; 5/5] START alpha=10.................................................... [CV 4/5; 5/5] END .....................alpha=10;, score=0.671 total time= 0.8s [CV 5/5; 5/5] START alpha=10.................................................... [CV 5/5; 5/5] END .....................alpha=10;, score=0.665 total time= 0.7s
GridSearchCV(cv=5, estimator=MultinomialNB(),
param_grid={'alpha': [0.01, 0.1, 0.5, 1, 10]}, verbose=10)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5, estimator=MultinomialNB(),
param_grid={'alpha': [0.01, 0.1, 0.5, 1, 10]}, verbose=10)MultinomialNB()
MultinomialNB()
y_pred_train_mnb = model2_mnb.predict(X_train)
y_pred_test_mnb = model2_mnb.predict(X_test)
report2_mnb = classification_report(y_test, y_pred_test_mnb)
print(f"Classification Report: \n{report2_mnb}")
Classification Report:
precision recall f1-score support
0 0.75 0.73 0.74 496
1 0.77 0.69 0.73 411
2 0.75 0.80 0.77 747
accuracy 0.75 1654
macro avg 0.75 0.74 0.75 1654
weighted avg 0.75 0.75 0.75 1654
print('Train Accuracy : %.3f'%model2_mnb.best_estimator_.score(X_train, y_train))
print('Test Accuracy : %.3f'%model2_mnb.best_estimator_.score(X_test, y_test))
print('Best Accuracy Through Grid Search : %.3f'%model2_mnb.best_score_)
print('Best Parameters : ',model2_mnb.best_params_)
Train Accuracy : 0.885
Test Accuracy : 0.752
Best Accuracy Through Grid Search : 0.718
Best Parameters : {'alpha': 1}
acc_train_mnb = accuracy_score(y_train, y_pred_train_mnb)
acc_test_mnb = accuracy_score(y_test, y_pred_test_mnb)
prec_mnb = precision_score(y_test, y_pred_test_mnb, average='weighted')
recall_mnb = recall_score(y_test, y_pred_test_mnb, average='weighted')
f1_score_mnb = f1_score(y_test, y_pred_test_mnb, average='weighted')
print('Multinomial Naive Bayes Accuracy Train percentage is:', round(acc_train_mnb*100,2))
print('Multinomial Naive Bayes Accuracy percentage is:', round(acc_test_mnb*100,2))
print('Multinomial Naive Bayes Precision percentage is:', round(prec_mnb*100,2))
print('Multinomial Naive Bayes Recall percentage is:', round(recall_mnb*100,2))
print('Multinomial Naive Bayes F1-score percentage is:', round(f1_score_mnb*100,2))
Multinomial Naive Bayes Accuracy Train percentage is: 88.49 Multinomial Naive Bayes Accuracy percentage is: 75.21 Multinomial Naive Bayes Precision percentage is: 75.26 Multinomial Naive Bayes Recall percentage is: 75.21 Multinomial Naive Bayes F1-score percentage is: 75.14
mnb_confusion = train_classifier_get_confusion_metric(model2_mnb, X_train, y_train, X_test, y_test)
print("Confusion Matrix: ", mnb_confusion)
disp = sns.heatmap(mnb_confusion, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
disp.plot()
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for Multinomial Naive Bayes')
plt.show()
Fitting 5 folds for each of 5 candidates, totalling 25 fits [CV 1/5; 1/5] START alpha=0.01.................................................. [CV 1/5; 1/5] END ...................alpha=0.01;, score=0.710 total time= 0.7s [CV 2/5; 1/5] START alpha=0.01.................................................. [CV 2/5; 1/5] END ...................alpha=0.01;, score=0.720 total time= 0.7s [CV 3/5; 1/5] START alpha=0.01.................................................. [CV 3/5; 1/5] END ...................alpha=0.01;, score=0.750 total time= 0.7s [CV 4/5; 1/5] START alpha=0.01.................................................. [CV 4/5; 1/5] END ...................alpha=0.01;, score=0.703 total time= 0.7s [CV 5/5; 1/5] START alpha=0.01.................................................. [CV 5/5; 1/5] END ...................alpha=0.01;, score=0.702 total time= 0.7s [CV 1/5; 2/5] START alpha=0.1................................................... [CV 1/5; 2/5] END ....................alpha=0.1;, score=0.715 total time= 0.7s [CV 2/5; 2/5] START alpha=0.1................................................... [CV 2/5; 2/5] END ....................alpha=0.1;, score=0.729 total time= 1.5s [CV 3/5; 2/5] START alpha=0.1................................................... [CV 3/5; 2/5] END ....................alpha=0.1;, score=0.751 total time= 1.0s [CV 4/5; 2/5] START alpha=0.1................................................... [CV 4/5; 2/5] END ....................alpha=0.1;, score=0.694 total time= 0.8s [CV 5/5; 2/5] START alpha=0.1................................................... [CV 5/5; 2/5] END ....................alpha=0.1;, score=0.699 total time= 0.7s [CV 1/5; 3/5] START alpha=0.5................................................... [CV 1/5; 3/5] END ....................alpha=0.5;, score=0.706 total time= 0.7s [CV 2/5; 3/5] START alpha=0.5................................................... [CV 2/5; 3/5] END ....................alpha=0.5;, score=0.731 total time= 0.7s [CV 3/5; 3/5] START alpha=0.5................................................... [CV 3/5; 3/5] END ....................alpha=0.5;, score=0.753 total time= 0.7s [CV 4/5; 3/5] START alpha=0.5................................................... [CV 4/5; 3/5] END ....................alpha=0.5;, score=0.697 total time= 1.6s [CV 5/5; 3/5] START alpha=0.5................................................... [CV 5/5; 3/5] END ....................alpha=0.5;, score=0.694 total time= 0.9s [CV 1/5; 4/5] START alpha=1..................................................... [CV 1/5; 4/5] END ......................alpha=1;, score=0.701 total time= 0.9s [CV 2/5; 4/5] START alpha=1..................................................... [CV 2/5; 4/5] END ......................alpha=1;, score=0.737 total time= 0.8s [CV 3/5; 4/5] START alpha=1..................................................... [CV 3/5; 4/5] END ......................alpha=1;, score=0.751 total time= 0.8s [CV 4/5; 4/5] START alpha=1..................................................... [CV 4/5; 4/5] END ......................alpha=1;, score=0.709 total time= 0.7s [CV 5/5; 4/5] START alpha=1..................................................... [CV 5/5; 4/5] END ......................alpha=1;, score=0.693 total time= 0.7s [CV 1/5; 5/5] START alpha=10.................................................... [CV 1/5; 5/5] END .....................alpha=10;, score=0.632 total time= 0.7s [CV 2/5; 5/5] START alpha=10.................................................... [CV 2/5; 5/5] END .....................alpha=10;, score=0.685 total time= 0.8s [CV 3/5; 5/5] START alpha=10.................................................... [CV 3/5; 5/5] END .....................alpha=10;, score=0.724 total time= 0.8s [CV 4/5; 5/5] START alpha=10.................................................... [CV 4/5; 5/5] END .....................alpha=10;, score=0.671 total time= 0.7s [CV 5/5; 5/5] START alpha=10.................................................... [CV 5/5; 5/5] END .....................alpha=10;, score=0.665 total time= 0.8s Confusion Matrix: [[360 28 108] [ 33 284 94] [ 89 58 600]]
end_mnb = time.time()
Training_time_mnb = end_mnb-start_mnb
print("The time of execution of Multinomial Naive Bayes Model is :",
Training_time_mnb , "ms")
The time of execution of Multinomial Naive Bayes Model is : 44.67935800552368 ms
start_bnb = time.time()
model3_bnb = BernoulliNB()
model3_bnb = model3_bnb.fit(X_train,y_train)
model3_bnb
BernoulliNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
BernoulliNB()
y_pred_train_bnb = model3_bnb.predict(X_train)
y_pred_test_bnb = model3_bnb.predict(X_test)
report3_bnb = classification_report(y_test, y_pred_test_bnb)
print(f"Classification Report: \n{report3_bnb}")
Classification Report:
precision recall f1-score support
0 0.83 0.57 0.68 496
1 0.80 0.59 0.68 411
2 0.67 0.90 0.76 747
accuracy 0.72 1654
macro avg 0.77 0.69 0.71 1654
weighted avg 0.75 0.72 0.72 1654
acc_train_bnb = accuracy_score(y_train, y_pred_train_bnb)
acc_test_bnb = accuracy_score(y_test, y_pred_test_bnb)
prec_bnb = precision_score(y_test, y_pred_test_bnb, average='weighted')
recall_bnb = recall_score(y_test, y_pred_test_bnb, average='weighted')
f1_score_bnb = f1_score(y_test, y_pred_test_bnb, average='weighted')
print('BernoulliNB Accuracy Train percentage is:', round(acc_train_bnb*100,2))
print('BernoulliNB Accuracy percentage is:', round(acc_test_bnb*100,2))
print('BernoulliNB Precision percentage is:', round(prec_bnb*100,2))
print('BernoulliNB Recall percentage is:', round(recall_bnb*100,2))
print('BernoulliNB F1-score percentage is:', round(f1_score_bnb*100,2))
BernoulliNB Accuracy Train percentage is: 82.48 BernoulliNB Accuracy percentage is: 72.43 BernoulliNB Precision percentage is: 74.87 BernoulliNB Recall percentage is: 72.43 BernoulliNB F1-score percentage is: 71.74
bnb_confusion = train_classifier_get_confusion_metric(model3_bnb, X_train, y_train, X_test, y_test)
print("Confusion Matrix: ", bnb_confusion)
disp = sns.heatmap(bnb_confusion, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
disp.plot()
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for Bernoulli Naive Bayes')
plt.show()
Confusion Matrix: [[285 25 186] [ 18 241 152] [ 39 36 672]]
end_bnb = time.time()
Training_time_bnb = end_bnb - start_bnb
print("The time of execution of Bernoulli Naive Bayes Model is :",
Training_time_bnb , "ms")
The time of execution of Bernoulli Naive Bayes Model is : 3.3895552158355713 ms
start_xg = time.time()
model4_xgb = XGBClassifier(random_state=42, n_estimators=100, max_depth=20, min_samples_split=2, min_samples_leaf=1)
model4_xgb = model4_xgb.fit(X_train,y_train)
model4_xgb
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=20, max_leaves=None,
min_child_weight=None, min_samples_leaf=1, min_samples_split=2,
missing=nan, monotone_constraints=None, multi_strategy=None,
n_estimators=100, n_jobs=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=20, max_leaves=None,
min_child_weight=None, min_samples_leaf=1, min_samples_split=2,
missing=nan, monotone_constraints=None, multi_strategy=None,
n_estimators=100, n_jobs=None, ...)y_pred_train_xgb = model4_xgb.predict(X_train)
y_pred_test_xgb = model4_xgb.predict(X_test)
report4_xgb = classification_report(y_test, y_pred_test_xgb)
print(f"Classification Report: \n{report4_xgb}")
Classification Report:
precision recall f1-score support
0 0.84 0.67 0.75 496
1 0.73 0.67 0.70 411
2 0.72 0.85 0.78 747
accuracy 0.75 1654
macro avg 0.76 0.73 0.74 1654
weighted avg 0.76 0.75 0.75 1654
acc_train_xgb = accuracy_score(y_train, y_pred_train_xgb)
acc_test_xgb = accuracy_score(y_test, y_pred_test_xgb)
prec_xgb = precision_score(y_test, y_pred_test_xgb, average='weighted')
recall_xgb = recall_score(y_test, y_pred_test_xgb, average='weighted')
f1_score_xgb = f1_score(y_test, y_pred_test_xgb, average='weighted')
print('XGBoost Classifier Accuracy Train percentage is:', round(acc_train_xgb*100,2))
print('XGBoost Classifier Accuracy percentage is:', round(acc_test_xgb*100,2))
print('XGBoost Classifier Precision percentage is:', round(prec_xgb*100,2))
print('XGBoost Classifier Recall percentage is:', round(recall_xgb*100,2))
print('XGBoost Classifier F1-score percentage is:', round(f1_score_xgb*100,2))
XGBoost Classifier Accuracy Train percentage is: 91.14 XGBoost Classifier Accuracy percentage is: 75.21 XGBoost Classifier Precision percentage is: 75.97 XGBoost Classifier Recall percentage is: 75.21 XGBoost Classifier F1-score percentage is: 75.01
xgb_confusion = train_classifier_get_confusion_metric(model4_xgb, X_train, y_train, X_test, y_test)
print("Confusion Matrix: ", xgb_confusion)
disp = sns.heatmap(xgb_confusion, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
disp.plot()
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for XGBoost Classifier')
plt.show()
Confusion Matrix: [[332 40 124] [ 13 275 123] [ 50 60 637]]
end_xg = time.time()
Training_time_xg = end_xg-start_xg
print("The time of execution of XGBoost Classifier Model is :",
Training_time_xg , "ms")
The time of execution of XGBoost Classifier Model is : 28.808629035949707 ms
## Random Forest Classifier
start_rfc = time.time()
# Model Building
model5_rfc = RandomForestClassifier(n_estimators=100, random_state=42)
model5_rfc.fit(X_train, y_train)
model5_rfc
RandomForestClassifier(random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(random_state=42)
y_pred_train_rfc = model5_rfc.predict(X_train)
y_pred_test_rfc = model5_rfc.predict(X_test)
report5_rfc = classification_report(y_test, y_pred_test_rfc)
print(f"Classification Report: \n{report5_rfc}")
Classification Report:
precision recall f1-score support
0 0.84 0.74 0.79 496
1 0.76 0.72 0.74 411
2 0.76 0.84 0.80 747
accuracy 0.78 1654
macro avg 0.79 0.77 0.77 1654
weighted avg 0.78 0.78 0.78 1654
acc_train_rfc = accuracy_score(y_train, y_pred_train_rfc)
acc_test_rfc = accuracy_score(y_test, y_pred_test_rfc)
prec_rfc = precision_score(y_test, y_pred_test_rfc, average='weighted')
recall_rfc = recall_score(y_test, y_pred_test_rfc, average='weighted')
f1_score_rfc = f1_score(y_test, y_pred_test_rfc, average='weighted')
print('Random Forest Classifier Accuracy Train percentage is:', round(acc_train_rfc*100,2))
print('Random Forest Classifier Accuracy percentage is:', round(acc_test_rfc*100,2))
print('Random Forest Classifier Precision percentage is:', round(prec_rfc*100,2))
print('Random Forest Classifier Recall percentage is:', round(recall_rfc*100,2))
print('Random Forest Classifier F1-score percentage is:', round(f1_score_rfc*100,2))
Random Forest Classifier Accuracy Train percentage is: 99.51 Random Forest Classifier Accuracy percentage is: 78.05 Random Forest Classifier Precision percentage is: 78.33 Random Forest Classifier Recall percentage is: 78.05 Random Forest Classifier F1-score percentage is: 77.98
rfc_confusion = train_classifier_get_confusion_metric(model5_rfc, X_train, y_train, X_test, y_test)
print("Confusion Matrix: ", rfc_confusion)
disp = sns.heatmap(rfc_confusion, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
disp.plot()
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for XGBoost Classifier')
plt.show()
Confusion Matrix: [[366 36 94] [ 13 294 104] [ 57 59 631]]
end_rfc = time.time()
Training_time_rfc = end_rfc-start_rfc
print("The time of execution of Random Forest Classifier Model is :",
Training_time_rfc , "ms")
The time of execution of Random Forest Classifier Model is : 84.49133801460266 ms
model = models.Sequential()
# Input - Layer
model.add(layers.Dense(100, activation = "relu", input_shape=(8000, )))
model.add(tf.keras.layers.Flatten()) #Flattening the layer
model.add(tf.keras.layers.BatchNormalization())
# Hidden - Layers
model.add(tf.keras.layers.Dense(50, activation='relu'))
model.add(Dropout(0.5)) #Dropout of 50% in each hidden layer
model.add(tf.keras.layers.Dense(50, activation='relu'))
model.add(Dropout(0.5)) #Dropout of 50% in each hidden layer
# Output- Layer
model.add(layers.Dense(3, activation = "softmax"))
model.summary()
Model: "sequential"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓ ┃ Layer (type) ┃ Output Shape ┃ Param # ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩ │ dense (Dense) │ (None, 100) │ 800,100 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ flatten (Flatten) │ (None, 100) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ batch_normalization │ (None, 100) │ 400 │ │ (BatchNormalization) │ │ │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_1 (Dense) │ (None, 50) │ 5,050 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dropout (Dropout) │ (None, 50) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_2 (Dense) │ (None, 50) │ 2,550 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dropout_1 (Dropout) │ (None, 50) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_3 (Dense) │ (None, 3) │ 153 │ └─────────────────────────────────┴────────────────────────┴───────────────┘
Total params: 808,253 (3.08 MB)
Trainable params: 808,053 (3.08 MB)
Non-trainable params: 200 (800.00 B)
optimize = keras.optimizers.Adam(learning_rate=0.001)
#Compile model
model.compile(optimizer=optimize, loss=tf.losses.SparseCategoricalCrossentropy(from_logits=False), metrics=['accuracy'])
from keras import callbacks
earlystopping = callbacks.EarlyStopping(monitor="val_loss",
mode="min",
patience=10,
restore_best_weights=True)
start = time.perf_counter()
results_nn = model.fit(
X_train,y_train,
epochs= 150,
batch_size = 512,
validation_data = (X_test, y_test),
verbose=True,
callbacks=[earlystopping]
)
elapsed_time_nn = time.perf_counter() - start
print("Elapsed time for Neural Network without sampling is %.2f seconds." %elapsed_time_nn)
Epoch 1/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 3s 100ms/step - accuracy: 0.3212 - loss: 1.3421 - val_accuracy: 0.4547 - val_loss: 1.0880 Epoch 2/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 72ms/step - accuracy: 0.4379 - loss: 1.1059 - val_accuracy: 0.4516 - val_loss: 1.0791 Epoch 3/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 71ms/step - accuracy: 0.5111 - loss: 0.9886 - val_accuracy: 0.4516 - val_loss: 1.0728 Epoch 4/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 69ms/step - accuracy: 0.5675 - loss: 0.9021 - val_accuracy: 0.4516 - val_loss: 1.0671 Epoch 5/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 70ms/step - accuracy: 0.6012 - loss: 0.8250 - val_accuracy: 0.4583 - val_loss: 1.0596 Epoch 6/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 83ms/step - accuracy: 0.6635 - loss: 0.7530 - val_accuracy: 0.4752 - val_loss: 1.0477 Epoch 7/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 120ms/step - accuracy: 0.7230 - loss: 0.6667 - val_accuracy: 0.4994 - val_loss: 1.0298 Epoch 8/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 88ms/step - accuracy: 0.7671 - loss: 0.5717 - val_accuracy: 0.5339 - val_loss: 1.0075 Epoch 9/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 69ms/step - accuracy: 0.8163 - loss: 0.4846 - val_accuracy: 0.5574 - val_loss: 0.9800 Epoch 10/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 78ms/step - accuracy: 0.8603 - loss: 0.3968 - val_accuracy: 0.6034 - val_loss: 0.9502 Epoch 11/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 81ms/step - accuracy: 0.8927 - loss: 0.3248 - val_accuracy: 0.6161 - val_loss: 0.9182 Epoch 12/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 84ms/step - accuracy: 0.9140 - loss: 0.2582 - val_accuracy: 0.6233 - val_loss: 0.8870 Epoch 13/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 59ms/step - accuracy: 0.9401 - loss: 0.1950 - val_accuracy: 0.6330 - val_loss: 0.8591 Epoch 14/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 62ms/step - accuracy: 0.9483 - loss: 0.1647 - val_accuracy: 0.6457 - val_loss: 0.8362 Epoch 15/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 61ms/step - accuracy: 0.9589 - loss: 0.1348 - val_accuracy: 0.6536 - val_loss: 0.8177 Epoch 16/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 62ms/step - accuracy: 0.9735 - loss: 0.1013 - val_accuracy: 0.6608 - val_loss: 0.7982 Epoch 17/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 62ms/step - accuracy: 0.9710 - loss: 0.0945 - val_accuracy: 0.6669 - val_loss: 0.7857 Epoch 18/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 64ms/step - accuracy: 0.9761 - loss: 0.0844 - val_accuracy: 0.6711 - val_loss: 0.7748 Epoch 19/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 63ms/step - accuracy: 0.9799 - loss: 0.0632 - val_accuracy: 0.6747 - val_loss: 0.7685 Epoch 20/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 77ms/step - accuracy: 0.9842 - loss: 0.0562 - val_accuracy: 0.6947 - val_loss: 0.7491 Epoch 21/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 62ms/step - accuracy: 0.9844 - loss: 0.0537 - val_accuracy: 0.7037 - val_loss: 0.7446 Epoch 22/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 62ms/step - accuracy: 0.9864 - loss: 0.0517 - val_accuracy: 0.7092 - val_loss: 0.7411 Epoch 23/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 62ms/step - accuracy: 0.9854 - loss: 0.0498 - val_accuracy: 0.7177 - val_loss: 0.7302 Epoch 24/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 64ms/step - accuracy: 0.9872 - loss: 0.0411 - val_accuracy: 0.7219 - val_loss: 0.7297 Epoch 25/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 61ms/step - accuracy: 0.9891 - loss: 0.0356 - val_accuracy: 0.7207 - val_loss: 0.7374 Epoch 26/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 62ms/step - accuracy: 0.9918 - loss: 0.0352 - val_accuracy: 0.7225 - val_loss: 0.7315 Epoch 27/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 63ms/step - accuracy: 0.9922 - loss: 0.0335 - val_accuracy: 0.7279 - val_loss: 0.7204 Epoch 28/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 62ms/step - accuracy: 0.9909 - loss: 0.0380 - val_accuracy: 0.7358 - val_loss: 0.7209 Epoch 29/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 80ms/step - accuracy: 0.9876 - loss: 0.0374 - val_accuracy: 0.7412 - val_loss: 0.7166 Epoch 30/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 63ms/step - accuracy: 0.9884 - loss: 0.0326 - val_accuracy: 0.7455 - val_loss: 0.7108 Epoch 31/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 62ms/step - accuracy: 0.9921 - loss: 0.0286 - val_accuracy: 0.7491 - val_loss: 0.7178 Epoch 32/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 66ms/step - accuracy: 0.9920 - loss: 0.0283 - val_accuracy: 0.7503 - val_loss: 0.7241 Epoch 33/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 65ms/step - accuracy: 0.9944 - loss: 0.0230 - val_accuracy: 0.7527 - val_loss: 0.7247 Epoch 34/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 66ms/step - accuracy: 0.9909 - loss: 0.0269 - val_accuracy: 0.7600 - val_loss: 0.7171 Epoch 35/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 64ms/step - accuracy: 0.9920 - loss: 0.0258 - val_accuracy: 0.7600 - val_loss: 0.7219 Epoch 36/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 63ms/step - accuracy: 0.9925 - loss: 0.0230 - val_accuracy: 0.7594 - val_loss: 0.7355 Epoch 37/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 62ms/step - accuracy: 0.9934 - loss: 0.0280 - val_accuracy: 0.7600 - val_loss: 0.7428 Epoch 38/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 63ms/step - accuracy: 0.9927 - loss: 0.0268 - val_accuracy: 0.7684 - val_loss: 0.7291 Epoch 39/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 63ms/step - accuracy: 0.9943 - loss: 0.0190 - val_accuracy: 0.7654 - val_loss: 0.7303 Epoch 40/150 8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 77ms/step - accuracy: 0.9914 - loss: 0.0259 - val_accuracy: 0.7666 - val_loss: 0.7548 Elapsed time for Neural Network without sampling is 27.82 seconds.
hist_df1 = pd.DataFrame(results_nn.history)
hist_df1.loc[:, ['loss','val_loss']].plot()
hist_df1.loc[:, ['accuracy','val_accuracy']].plot()
plt.show()
y_pred_train_nn = model.predict(X_train)
y_pred_train_nn = y_pred_train_nn.argmax(axis=1)
121/121 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step
y_pred_test_nn = model.predict(X_test)
y_pred_test_nn = y_pred_test_nn.argmax(axis=1)
52/52 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step
cm_nn = confusion_matrix(y_test, y_pred_test_nn)
ax = sns.heatmap(cm_nn, annot=True, fmt='d', cmap='flare', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel("Predicted Class", fontsize=12)
plt.ylabel("True Class",fontsize=12)
plt.title("Confusion Matrix", fontsize=12)
plt.show()
print(classification_report(y_test,y_pred_test_nn))
precision recall f1-score support
0 0.87 0.64 0.74 496
1 0.94 0.49 0.65 411
2 0.66 0.95 0.78 747
accuracy 0.75 1654
macro avg 0.82 0.70 0.72 1654
weighted avg 0.79 0.75 0.74 1654
acc_train_nn = accuracy_score(y_train, y_pred_train_nn)
acc_test_nn = accuracy_score(y_test, y_pred_test_nn)
prec_nn = precision_score(y_test, y_pred_test_nn, average='weighted')
recall_nn = recall_score(y_test, y_pred_test_nn, average='weighted')
f1_score_nn = f1_score(y_test, y_pred_test_nn, average='weighted')
print('Neural Network Accuracy Train percentage is:', round(acc_train_nn*100,2))
print('Neural Network Accuracy Test Percentage is:', round(acc_test_nn*100,2))
print('Neural Network Precision percentage is:', round(prec_nn*100,2))
print('Neural Network Recall percentage is:', round(recall_nn*100,2))
print('Neural Network F1-score percentage is:', round(f1_score_nn*100,2))
Neural Network Accuracy Train percentage is: 94.45 Neural Network Accuracy Test Percentage is: 74.55 Neural Network Precision percentage is: 79.44 Neural Network Recall percentage is: 74.55 Neural Network F1-score percentage is: 73.57
print("The time of execution of Recurrent Neural Network Model is :",
elapsed_time_nn , "ms")
The time of execution of Recurrent Neural Network Model is : 27.819829301000027 ms
pd.set_option('display.width',1000)
df1 = pd.DataFrame([
['Logistic Regression', round(acc_train_lr*100,2), round(acc_test_lr*100,2), round(prec_lr*100,2), round(recall_lr*100,2), round(f1_score_lr*100,2)],
['XGBoost', round(acc_train_xgb*100,2), round(acc_test_xgb*100,2), round(prec_xgb*100,2), round(recall_xgb*100,2), round(f1_score_xgb*100,2)],
['Multinomial Naive Bayes', round(acc_train_mnb*100,2), round(acc_test_mnb*100,2), round(prec_mnb*100,2), round(recall_mnb*100,2), round(f1_score_mnb*100,2)],
['Bernoulli Naive Bayes', round(acc_train_bnb*100,2), round(acc_test_bnb*100,2), round(prec_bnb*100,2), round(recall_bnb*100,2), round(f1_score_bnb*100,2)],
['Random Forest Classifier', round(acc_train_rfc*100,2), round(acc_test_rfc*100,2), round(prec_rfc*100,2), round(recall_rfc*100,2), round(f1_score_rfc*100,2)],
['Neural Network', round(acc_train_nn*100,2), round(acc_test_nn*100,2), round(prec_nn*100,2), round(recall_nn*100,2), round(f1_score_nn*100,2)]
],
columns = ['Model', 'Train Accuracy', 'Test Accuracy', 'Precision', 'Recall', 'f1-Score']
)
print("Performance metrics for Climate Change Tweets without Sampling(%)")
print(df1)
Performance metrics for Climate Change Tweets without Sampling(%)
Model Train Accuracy Test Accuracy Precision Recall f1-Score
0 Logistic Regression 97.28 77.57 77.84 77.57 77.44
1 XGBoost 91.14 75.21 75.97 75.21 75.01
2 Multinomial Naive Bayes 88.49 75.21 75.26 75.21 75.14
3 Bernoulli Naive Bayes 82.48 72.43 74.87 72.43 71.74
4 Random Forest Classifier 99.51 78.05 78.33 78.05 77.98
5 Neural Network 94.45 74.55 79.44 74.55 73.57
# plot data in stack manner of bar type
a = df1.plot(x='Model', kind='barh', stacked=True,figsize = (8,6), xlabel='Metrics Percentage', ylabel = 'Models',
title='Performance metrics(%)')
for c1 in a.containers:
#labels = [v.get_height() if v.get_height() > 0 else '' for v in c]
a.bar_label(c1, label_type = 'center')
plt.legend(loc='upper right', bbox_to_anchor=(0.6, 0., 0.6, 0.6))
plt.xticks(rotation=90)
plt.show()
metrics = [[acc_train_lr,acc_test_lr,prec_lr,recall_lr,f1_score_lr],
[acc_train_xgb,acc_test_xgb,prec_xgb,recall_xgb,f1_score_xgb],
[acc_train_mnb,acc_test_mnb,prec_mnb,recall_mnb,f1_score_mnb],
[acc_train_bnb,acc_test_bnb,prec_bnb,recall_bnb,f1_score_bnb],
[acc_train_rfc,acc_test_rfc,prec_rfc,recall_rfc,f1_score_rfc],
[acc_train_nn,acc_test_nn,prec_nn,recall_nn,f1_score_nn],
]
Final_result = pd.DataFrame(metrics, columns=['Train Accuracy', 'Test Accuracy', 'Test Precision', 'Test Recall', 'Test F1 Score'],
index=['Logistic Regression','XGBoost','Multinomial Naive Bayes', 'Bernoulli Naive Bayes', 'Random Forest Classifier', 'Neural Network'])
Final_result.style.background_gradient(cmap='prism', axis=1)
| Train Accuracy | Test Accuracy | Test Precision | Test Recall | Test F1 Score | |
|---|---|---|---|---|---|
| Logistic Regression | 0.972791 | 0.775695 | 0.778405 | 0.775695 | 0.774395 |
| XGBoost | 0.911376 | 0.752116 | 0.759716 | 0.752116 | 0.750135 |
| Multinomial Naive Bayes | 0.884944 | 0.752116 | 0.752587 | 0.752116 | 0.751365 |
| Bernoulli Naive Bayes | 0.824825 | 0.724305 | 0.748688 | 0.724305 | 0.717429 |
| Random Forest Classifier | 0.995076 | 0.780532 | 0.783301 | 0.780532 | 0.779816 |
| Neural Network | 0.944545 | 0.745466 | 0.794425 | 0.745466 | 0.735720 |
Final_result.plot(kind='bar')
plt.legend(loc = 'lower right')
<matplotlib.legend.Legend at 0x140268610>
from imblearn.under_sampling import RandomUnderSampler
underresampler = RandomUnderSampler(random_state=0)
X_train_undersampled, y_train_undersampled = underresampler.fit_resample(X_train, y_train)
sns.countplot(x=y_train_undersampled)
<Axes: xlabel='Sentiment_Encoded', ylabel='count'>
print('Original dataset shape %s' % Counter(y_train))
print('Resampled dataset shape %s' % Counter(y_train_undersampled))
Original dataset shape Counter({2: 1752, 0: 1111, 1: 996})
Resampled dataset shape Counter({0: 996, 1: 996, 2: 996})
start_lr_downsample = time.time()
model1_lr_downsample = LogisticRegression(max_iter = 1000, multi_class = "multinomial")
model1_lr_downsample.fit(X_train_undersampled, y_train_undersampled)
LogisticRegression(max_iter=1000, multi_class='multinomial')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression(max_iter=1000, multi_class='multinomial')
y_pred_train_lr_downsample = model1_lr_downsample.predict(X_train_undersampled)
y_pred_test_lr_downsample = model1_lr_downsample.predict(X_test)
report1_lr_downsample = classification_report(y_test, y_pred_test_lr_downsample)
print(f"Classification Report: \n{report1_lr_downsample}")
Classification Report:
precision recall f1-score support
0 0.75 0.75 0.75 496
1 0.63 0.79 0.70 411
2 0.81 0.70 0.75 747
accuracy 0.73 1654
macro avg 0.73 0.74 0.73 1654
weighted avg 0.75 0.73 0.74 1654
acc_train_lr_down = accuracy_score(y_train_undersampled, y_pred_train_lr_downsample)
acc_test_lr_down = accuracy_score(y_test, y_pred_test_lr_downsample)
prec_lr_down = precision_score(y_test, y_pred_test_lr_downsample, average='weighted')
recall_lr_down = recall_score(y_test, y_pred_test_lr_downsample, average='weighted')
f1_score_lr_down = f1_score(y_test, y_pred_test_lr_downsample, average='weighted')
print('Logistic Regression DownSampling Accuracy Train percentage is:', round(acc_train_lr_down*100,2))
print('Logistic Regression DownSampling Accuracy percentage is:', round(acc_test_lr_down*100,2))
print('Logistic Regression DownSampling Precision percentage is:', round(prec_lr_down*100,2))
print('Logistic Regression DownSampling Recall percentage is:', round(recall_lr_down*100,2))
print('Logistic Regression DownSampling F1-score percentage is:', round(f1_score_lr_down*100,2))
Logistic Regression DownSampling Accuracy Train percentage is: 98.09 Logistic Regression DownSampling Accuracy percentage is: 73.46 Logistic Regression DownSampling Precision percentage is: 74.57 Logistic Regression DownSampling Recall percentage is: 73.46 Logistic Regression DownSampling F1-score percentage is: 73.61
# Display the confusion matrix
conf_matrix1_lr_down = confusion_matrix(y_test, y_pred_test_lr_downsample)
display(conf_matrix1_lr_down)
array([[370, 57, 69],
[ 33, 324, 54],
[ 93, 133, 521]])
plt.figure(figsize=(5, 3))
sns.heatmap(conf_matrix1_lr_down, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for Logistic Regression Downsampling')
plt.show()
end_lr_downsample = time.time()
Training_time_lr_downsample = end_lr_downsample - start_lr_downsample
print("The time of execution of Logistic Regression Downsampling Model is :",
Training_time_lr_downsample , "ms")
The time of execution of Logistic Regression Downsampling Model is : 6.733539819717407 ms
start_xg_down = time.time()
model2_xgb_down = XGBClassifier(random_state=0, n_estimators=100, max_depth=20, min_samples_split=2, min_samples_leaf=1)
model2_xgb_down = model2_xgb_down.fit(X_train_undersampled, y_train_undersampled)
y_pred_train_xgb_down = model2_xgb_down.predict(X_train_undersampled)
y_pred_test_xgb_down = model2_xgb_down.predict(X_test)
report2_xgb_down = classification_report(y_test, y_pred_test_xgb_down)
print(f"Classification Report: \n{report2_xgb_down}")
Classification Report:
precision recall f1-score support
0 0.76 0.70 0.73 496
1 0.56 0.82 0.67 411
2 0.80 0.64 0.71 747
accuracy 0.70 1654
macro avg 0.71 0.72 0.70 1654
weighted avg 0.73 0.70 0.71 1654
acc_train_xgb_down = accuracy_score(y_train_undersampled, y_pred_train_xgb_down)
acc_test_xgb_down = accuracy_score(y_test, y_pred_test_xgb_down)
prec_xgb_down = precision_score(y_test, y_pred_test_xgb_down, average='weighted')
recall_xgb_down = recall_score(y_test, y_pred_test_xgb_down, average='weighted')
f1_score_xgb_down = f1_score(y_test, y_pred_test_xgb_down, average='weighted')
print('XGBoost Classifier Downsampling Accuracy Train percentage is:', round(acc_train_xgb_down*100,2))
print('XGBoost Classifier Downsampling Accuracy percentage is:', round(acc_test_xgb_down*100,2))
print('XGBoost Classifier Downsampling Precision percentage is:', round(prec_xgb_down*100,2))
print('XGBoost Classifier Downsampling Recall percentage is:', round(recall_xgb_down*100,2))
print('XGBoost Classifier Downsampling F1-score percentage is:', round(f1_score_xgb_down*100,2))
XGBoost Classifier Downsampling Accuracy Train percentage is: 91.93 XGBoost Classifier Downsampling Accuracy percentage is: 70.37 XGBoost Classifier Downsampling Precision percentage is: 72.98 XGBoost Classifier Downsampling Recall percentage is: 70.37 XGBoost Classifier Downsampling F1-score percentage is: 70.64
# Display the confusion matrix
conf_matrix2_xgb_down = confusion_matrix(y_test, y_pred_test_xgb_down)
display(conf_matrix2_xgb_down)
array([[345, 79, 72],
[ 23, 339, 49],
[ 84, 183, 480]])
plt.figure(figsize=(5, 3))
sns.heatmap(conf_matrix2_xgb_down, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for XGBoost Classifier Downsampling')
plt.show()
end_xg_down = time.time()
Training_time_xg_down = end_xg_down-start_xg_down
print("The time of execution of XGBoost DownSampling Classifier Model is :",
Training_time_xg_down , "ms")
The time of execution of XGBoost DownSampling Classifier Model is : 12.741549015045166 ms
start_mnb_down = time.time()
params = {'alpha': [0.01,0.1,0.5,1,10],}
model3_mnb_down = GridSearchCV(MultinomialNB(), param_grid=params, cv=5, verbose=10)
model3_mnb_down = model3_mnb_down.fit(X_train_undersampled, y_train_undersampled)
Fitting 5 folds for each of 5 candidates, totalling 25 fits [CV 1/5; 1/5] START alpha=0.01.................................................. [CV 1/5; 1/5] END ...................alpha=0.01;, score=0.701 total time= 0.5s [CV 2/5; 1/5] START alpha=0.01.................................................. [CV 2/5; 1/5] END ...................alpha=0.01;, score=0.691 total time= 0.4s [CV 3/5; 1/5] START alpha=0.01.................................................. [CV 3/5; 1/5] END ...................alpha=0.01;, score=0.676 total time= 0.4s [CV 4/5; 1/5] START alpha=0.01.................................................. [CV 4/5; 1/5] END ...................alpha=0.01;, score=0.682 total time= 0.4s [CV 5/5; 1/5] START alpha=0.01.................................................. [CV 5/5; 1/5] END ...................alpha=0.01;, score=0.677 total time= 0.4s [CV 1/5; 2/5] START alpha=0.1................................................... [CV 1/5; 2/5] END ....................alpha=0.1;, score=0.697 total time= 0.4s [CV 2/5; 2/5] START alpha=0.1................................................... [CV 2/5; 2/5] END ....................alpha=0.1;, score=0.689 total time= 0.5s [CV 3/5; 2/5] START alpha=0.1................................................... [CV 3/5; 2/5] END ....................alpha=0.1;, score=0.684 total time= 0.4s [CV 4/5; 2/5] START alpha=0.1................................................... [CV 4/5; 2/5] END ....................alpha=0.1;, score=0.688 total time= 0.4s [CV 5/5; 2/5] START alpha=0.1................................................... [CV 5/5; 2/5] END ....................alpha=0.1;, score=0.675 total time= 0.4s [CV 1/5; 3/5] START alpha=0.5................................................... [CV 1/5; 3/5] END ....................alpha=0.5;, score=0.699 total time= 0.4s [CV 2/5; 3/5] START alpha=0.5................................................... [CV 2/5; 3/5] END ....................alpha=0.5;, score=0.697 total time= 0.4s [CV 3/5; 3/5] START alpha=0.5................................................... [CV 3/5; 3/5] END ....................alpha=0.5;, score=0.682 total time= 0.4s [CV 4/5; 3/5] START alpha=0.5................................................... [CV 4/5; 3/5] END ....................alpha=0.5;, score=0.695 total time= 0.4s [CV 5/5; 3/5] START alpha=0.5................................................... [CV 5/5; 3/5] END ....................alpha=0.5;, score=0.675 total time= 0.4s [CV 1/5; 4/5] START alpha=1..................................................... [CV 1/5; 4/5] END ......................alpha=1;, score=0.682 total time= 0.4s [CV 2/5; 4/5] START alpha=1..................................................... [CV 2/5; 4/5] END ......................alpha=1;, score=0.702 total time= 0.4s [CV 3/5; 4/5] START alpha=1..................................................... [CV 3/5; 4/5] END ......................alpha=1;, score=0.687 total time= 0.4s [CV 4/5; 4/5] START alpha=1..................................................... [CV 4/5; 4/5] END ......................alpha=1;, score=0.697 total time= 0.4s [CV 5/5; 4/5] START alpha=1..................................................... [CV 5/5; 4/5] END ......................alpha=1;, score=0.675 total time= 0.4s [CV 1/5; 5/5] START alpha=10.................................................... [CV 1/5; 5/5] END .....................alpha=10;, score=0.661 total time= 0.4s [CV 2/5; 5/5] START alpha=10.................................................... [CV 2/5; 5/5] END .....................alpha=10;, score=0.659 total time= 0.4s [CV 3/5; 5/5] START alpha=10.................................................... [CV 3/5; 5/5] END .....................alpha=10;, score=0.671 total time= 0.4s [CV 4/5; 5/5] START alpha=10.................................................... [CV 4/5; 5/5] END .....................alpha=10;, score=0.670 total time= 0.4s [CV 5/5; 5/5] START alpha=10.................................................... [CV 5/5; 5/5] END .....................alpha=10;, score=0.657 total time= 0.4s
y_pred_train_mnb_down = model3_mnb_down.predict(X_train_undersampled)
y_pred_test_mnb_down = model3_mnb_down.predict(X_test)
report3_mnb_down = classification_report(y_test, y_pred_test_mnb_down)
print(f"Classification Report: \n{report3_mnb_down}")
Classification Report:
precision recall f1-score support
0 0.67 0.75 0.70 496
1 0.68 0.74 0.71 411
2 0.78 0.68 0.73 747
accuracy 0.71 1654
macro avg 0.71 0.72 0.71 1654
weighted avg 0.72 0.71 0.72 1654
print('Train Accuracy : %.3f'%model3_mnb_down.best_estimator_.score(X_train_undersampled, y_train_undersampled))
print('Test Accuracy : %.3f'%model3_mnb_down.best_estimator_.score(X_test, y_test))
print('Best Accuracy Through Grid Search : %.3f'%model3_mnb_down.best_score_)
print('Best Parameters : ',model3_mnb_down.best_params_)
Train Accuracy : 0.911
Test Accuracy : 0.715
Best Accuracy Through Grid Search : 0.690
Best Parameters : {'alpha': 0.5}
acc_train_mnb_down = accuracy_score(y_train_undersampled, y_pred_train_mnb_down)
acc_test_mnb_down = accuracy_score(y_test, y_pred_test_mnb_down)
prec_mnb_down = precision_score(y_test, y_pred_test_mnb_down, average='weighted')
recall_mnb_down = recall_score(y_test, y_pred_test_mnb_down, average='weighted')
f1_score_mnb_down = f1_score(y_test, y_pred_test_mnb_down, average='weighted')
print('Multinomial Naive Bayes DownSampling Accuracy Train percentage is:', round(acc_train_mnb_down*100,2))
print('Multinomial Naive Bayes DownSampling Accuracy percentage is:', round(acc_test_mnb_down*100,2))
print('Multinomial Naive Bayes DownSampling Precision percentage is:', round(prec_mnb_down*100,2))
print('Multinomial Naive Bayes DownSampling Recall percentage is:', round(recall_mnb_down*100,2))
print('Multinomial Naive Bayes DownSampling F1-score percentage is:', round(f1_score_mnb_down*100,2))
Multinomial Naive Bayes DownSampling Accuracy Train percentage is: 91.1 Multinomial Naive Bayes DownSampling Accuracy percentage is: 71.46 Multinomial Naive Bayes DownSampling Precision percentage is: 72.07 Multinomial Naive Bayes DownSampling Recall percentage is: 71.46 Multinomial Naive Bayes DownSampling F1-score percentage is: 71.52
# Display the confusion matrix
conf_matrix3_mnb_down = confusion_matrix(y_test, y_pred_test_mnb_down)
display(conf_matrix3_mnb_down)
array([[371, 42, 83],
[ 46, 303, 62],
[140, 99, 508]])
# Display the confusion matrix
conf_matrix3_mnb_down = confusion_matrix(y_test, y_pred_test_mnb_down)
display(conf_matrix3_mnb_down)
array([[371, 42, 83],
[ 46, 303, 62],
[140, 99, 508]])
plt.figure(figsize=(5, 3))
sns.heatmap(conf_matrix3_mnb_down, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for Multinomial Naive Bayes DownSampling')
plt.show()
end_mnb_down = time.time()
Training_time_mnb_down = end_mnb_down-start_mnb_down
print("The time of execution of Multinomial Naive Bayes DownSampling Model is :",
Training_time_mnb_down , "ms")
The time of execution of Multinomial Naive Bayes DownSampling Model is : 12.120550155639648 ms
start_bnb_down = time.time()
model4_bnb_down = BernoulliNB()
model4_bnb_down = model4_bnb_down.fit(X_train_undersampled, y_train_undersampled)
model4_bnb_down
BernoulliNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
BernoulliNB()
y_pred_train_bnb_down = model4_bnb_down.predict(X_train_undersampled)
y_pred_test_bnb_down = model4_bnb_down.predict(X_test)
report4_bnb_down = classification_report(y_test, y_pred_test_bnb_down)
print(f"Classification Report: \n{report4_bnb_down}")
Classification Report:
precision recall f1-score support
0 0.71 0.71 0.71 496
1 0.58 0.81 0.68 411
2 0.79 0.62 0.70 747
accuracy 0.70 1654
macro avg 0.69 0.71 0.69 1654
weighted avg 0.72 0.70 0.70 1654
acc_train_bnb_down = accuracy_score(y_train_undersampled, y_pred_train_bnb_down)
acc_test_bnb_down = accuracy_score(y_test, y_pred_test_bnb_down)
prec_bnb_down = precision_score(y_test, y_pred_test_bnb_down, average='weighted')
recall_bnb_down = recall_score(y_test, y_pred_test_bnb_down, average='weighted')
f1_score_bnb_down = f1_score(y_test, y_pred_test_bnb_down, average='weighted')
print('Bernoulli Naive Bayes DownSampling Accuracy Train percentage is:', round(acc_train_bnb_down*100,2))
print('Bernoulli Naive Bayes DownSampling Accuracy percentage is:', round(acc_test_bnb_down*100,2))
print('Bernoulli Naive Bayes DownSampling Precision percentage is:', round(prec_bnb_down*100,2))
print('Bernoulli Naive Bayes DownSampling Recall percentage is:', round(recall_bnb_down*100,2))
print('Bernoulli Naive Bayes DownSampling F1-score percentage is:', round(f1_score_bnb_down*100,2))
Bernoulli Naive Bayes DownSampling Accuracy Train percentage is: 89.52 Bernoulli Naive Bayes DownSampling Accuracy percentage is: 69.53 Bernoulli Naive Bayes DownSampling Precision percentage is: 71.53 Bernoulli Naive Bayes DownSampling Recall percentage is: 69.53 Bernoulli Naive Bayes DownSampling F1-score percentage is: 69.62
# Display the confusion matrix
conf_matrix4_bnb_down = confusion_matrix(y_test, y_pred_test_bnb_down)
display(conf_matrix4_bnb_down)
array([[350, 70, 76],
[ 31, 334, 46],
[114, 167, 466]])
plt.figure(figsize=(5, 3))
sns.heatmap(conf_matrix4_bnb_down, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for Bernoulli Naive Bayes')
plt.show()
end_bnb_down = time.time()
Training_time_bnb_down = end_bnb_down - start_bnb_down
print("The time of execution of Bernoulli Naive Bayes DownSampling Model is :",
Training_time_bnb_down , "ms")
The time of execution of Bernoulli Naive Bayes DownSampling Model is : 1.8181569576263428 ms
start_rfc_down = time.time()
model5_rfc_down = RandomForestClassifier(n_estimators=100, random_state=0)
model5_rfc_down = model5_rfc_down.fit(X_train_undersampled,y_train_undersampled)
model5_rfc_down
RandomForestClassifier(random_state=0)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(random_state=0)
y_pred_train_rfc_down = model5_rfc_down.predict(X_train_undersampled)
y_pred_test_rfc_down = model5_rfc_down.predict(X_test)
report5_rfc_down = classification_report(y_test, y_pred_test_rfc_down)
print(f"Classification Report: \n{report5_rfc_down}")
Classification Report:
precision recall f1-score support
0 0.78 0.74 0.76 496
1 0.57 0.85 0.68 411
2 0.82 0.63 0.71 747
accuracy 0.72 1654
macro avg 0.72 0.74 0.72 1654
weighted avg 0.75 0.72 0.72 1654
acc_train_rfc_down = accuracy_score(y_train_undersampled, y_pred_train_rfc_down)
acc_test_rfc_down = accuracy_score(y_test, y_pred_test_rfc_down)
prec_rfc_down = precision_score(y_test, y_pred_test_rfc_down, average='weighted')
recall_rfc_down = recall_score(y_test, y_pred_test_rfc_down, average='weighted')
f1_score_rfc_down = f1_score(y_test, y_pred_test_rfc_down, average='weighted')
print('Random Forest Classifier DownSampling Accuracy Train percentage is:', round(acc_train_rfc_down*100,2))
print('Random Forest Classifier DownSampling Accuracy percentage is:', round(acc_test_rfc_down*100,2))
print('Random Forest Classifier DownSampling Precision percentage is:', round(prec_rfc_down*100,2))
print('Random Forest Classifier DownSampling Recall percentage is:', round(recall_rfc_down*100,2))
print('Random Forest Classifier DownSampling F1-score percentage is:', round(f1_score_rfc_down*100,2))
Random Forest Classifier DownSampling Accuracy Train percentage is: 99.56 Random Forest Classifier DownSampling Accuracy percentage is: 71.64 Random Forest Classifier DownSampling Precision percentage is: 74.67 Random Forest Classifier DownSampling Recall percentage is: 71.64 Random Forest Classifier DownSampling F1-score percentage is: 71.84
# Display the confusion matrix
conf_matrix5_rfc_down = confusion_matrix(y_test, y_pred_test_rfc_down)
display(conf_matrix5_rfc_down)
array([[367, 69, 60],
[ 20, 351, 40],
[ 86, 194, 467]])
plt.figure(figsize=(5, 3))
sns.heatmap(conf_matrix5_rfc_down, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for Random Forest Classifier')
plt.show()
end_rfc_down = time.time()
Training_time_rfc_down = end_rfc_down - start_rfc_down
print("The time of execution of Random Forest Classifier DownSampling Model is :",
Training_time_rfc_down , "ms")
The time of execution of Random Forest Classifier DownSampling Model is : 27.15565586090088 ms
model2 = models.Sequential()
# Input - Layer
model2.add(layers.Dense(100, activation = "relu", input_shape=(8000, )))
model2.add(tf.keras.layers.Flatten()) #Flattening the layer
model2.add(tf.keras.layers.BatchNormalization())
# Hidden - Layers
model2.add(tf.keras.layers.Dense(100, activation='relu'))
model2.add(Dropout(0.5)) #Dropout of 50% in each hidden layer
model2.add(tf.keras.layers.Dense(100, activation='relu'))
model2.add(Dropout(0.5)) #Dropout of 50% in each hidden layer
# Output- Layer
model2.add(layers.Dense(3, activation = "softmax"))
model2.summary()
Model: "sequential_1"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓ ┃ Layer (type) ┃ Output Shape ┃ Param # ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩ │ dense_4 (Dense) │ (None, 100) │ 800,100 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ flatten_1 (Flatten) │ (None, 100) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ batch_normalization_1 │ (None, 100) │ 400 │ │ (BatchNormalization) │ │ │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_5 (Dense) │ (None, 100) │ 10,100 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dropout_2 (Dropout) │ (None, 100) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_6 (Dense) │ (None, 100) │ 10,100 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dropout_3 (Dropout) │ (None, 100) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_7 (Dense) │ (None, 3) │ 303 │ └─────────────────────────────────┴────────────────────────┴───────────────┘
Total params: 821,003 (3.13 MB)
Trainable params: 820,803 (3.13 MB)
Non-trainable params: 200 (800.00 B)
optimize_down = keras.optimizers.Adam(learning_rate=0.001)
model2.compile(optimizer=optimize_down, loss=tf.losses.SparseCategoricalCrossentropy(from_logits=False), metrics=['accuracy'])
from keras import callbacks
earlystopping = callbacks.EarlyStopping(monitor="val_loss",
mode="min",
patience=10,
restore_best_weights=True)
start_down = time.perf_counter()
results_nn_down = model2.fit(
X_train_undersampled, y_train_undersampled,
epochs= 150,
batch_size = 512,
validation_data = (X_test, y_test),
verbose=True,
callbacks=[earlystopping]
)
elapsed_time_nn_down = time.perf_counter() - start_down
print("Elapsed time for Neural Network Down Sampling is %.2f seconds." %elapsed_time_nn_down)
Epoch 1/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 3s 227ms/step - accuracy: 0.3377 - loss: 1.3050 - val_accuracy: 0.4807 - val_loss: 1.0907 Epoch 2/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 76ms/step - accuracy: 0.4469 - loss: 1.0757 - val_accuracy: 0.5121 - val_loss: 1.0867 Epoch 3/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 83ms/step - accuracy: 0.5399 - loss: 0.9481 - val_accuracy: 0.5568 - val_loss: 1.0843 Epoch 4/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 66ms/step - accuracy: 0.5958 - loss: 0.8544 - val_accuracy: 0.5804 - val_loss: 1.0802 Epoch 5/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 89ms/step - accuracy: 0.7142 - loss: 0.6997 - val_accuracy: 0.6082 - val_loss: 1.0707 Epoch 6/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 101ms/step - accuracy: 0.7701 - loss: 0.6035 - val_accuracy: 0.6493 - val_loss: 1.0529 Epoch 7/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 68ms/step - accuracy: 0.8445 - loss: 0.4546 - val_accuracy: 0.6808 - val_loss: 1.0277 Epoch 8/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 66ms/step - accuracy: 0.8873 - loss: 0.3533 - val_accuracy: 0.6929 - val_loss: 0.9974 Epoch 9/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 66ms/step - accuracy: 0.9198 - loss: 0.2645 - val_accuracy: 0.7025 - val_loss: 0.9668 Epoch 10/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 63ms/step - accuracy: 0.9490 - loss: 0.1774 - val_accuracy: 0.7140 - val_loss: 0.9346 Epoch 11/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 121ms/step - accuracy: 0.9637 - loss: 0.1330 - val_accuracy: 0.7304 - val_loss: 0.8987 Epoch 12/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 83ms/step - accuracy: 0.9768 - loss: 0.0903 - val_accuracy: 0.7291 - val_loss: 0.8686 Epoch 13/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 80ms/step - accuracy: 0.9854 - loss: 0.0676 - val_accuracy: 0.7273 - val_loss: 0.8443 Epoch 14/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 84ms/step - accuracy: 0.9856 - loss: 0.0541 - val_accuracy: 0.7237 - val_loss: 0.8224 Epoch 15/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 80ms/step - accuracy: 0.9870 - loss: 0.0492 - val_accuracy: 0.7225 - val_loss: 0.8011 Epoch 16/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 83ms/step - accuracy: 0.9870 - loss: 0.0400 - val_accuracy: 0.7304 - val_loss: 0.7815 Epoch 17/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 105ms/step - accuracy: 0.9887 - loss: 0.0361 - val_accuracy: 0.7334 - val_loss: 0.7677 Epoch 18/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 152ms/step - accuracy: 0.9887 - loss: 0.0322 - val_accuracy: 0.7267 - val_loss: 0.7578 Epoch 19/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 121ms/step - accuracy: 0.9906 - loss: 0.0335 - val_accuracy: 0.7261 - val_loss: 0.7463 Epoch 20/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 2s 148ms/step - accuracy: 0.9927 - loss: 0.0249 - val_accuracy: 0.7358 - val_loss: 0.7338 Epoch 21/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 86ms/step - accuracy: 0.9919 - loss: 0.0251 - val_accuracy: 0.7376 - val_loss: 0.7237 Epoch 22/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 100ms/step - accuracy: 0.9899 - loss: 0.0244 - val_accuracy: 0.7424 - val_loss: 0.7135 Epoch 23/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 113ms/step - accuracy: 0.9942 - loss: 0.0206 - val_accuracy: 0.7400 - val_loss: 0.7054 Epoch 24/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 100ms/step - accuracy: 0.9921 - loss: 0.0311 - val_accuracy: 0.7249 - val_loss: 0.7029 Epoch 25/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 87ms/step - accuracy: 0.9929 - loss: 0.0233 - val_accuracy: 0.7273 - val_loss: 0.6966 Epoch 26/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 88ms/step - accuracy: 0.9938 - loss: 0.0236 - val_accuracy: 0.7237 - val_loss: 0.6921 Epoch 27/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 96ms/step - accuracy: 0.9945 - loss: 0.0164 - val_accuracy: 0.7340 - val_loss: 0.6841 Epoch 28/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 103ms/step - accuracy: 0.9919 - loss: 0.0201 - val_accuracy: 0.7370 - val_loss: 0.6750 Epoch 29/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 86ms/step - accuracy: 0.9955 - loss: 0.0189 - val_accuracy: 0.7340 - val_loss: 0.6706 Epoch 30/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 89ms/step - accuracy: 0.9938 - loss: 0.0199 - val_accuracy: 0.7328 - val_loss: 0.6681 Epoch 31/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 90ms/step - accuracy: 0.9955 - loss: 0.0146 - val_accuracy: 0.7316 - val_loss: 0.6635 Epoch 32/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 92ms/step - accuracy: 0.9940 - loss: 0.0190 - val_accuracy: 0.7279 - val_loss: 0.6594 Epoch 33/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 83ms/step - accuracy: 0.9950 - loss: 0.0137 - val_accuracy: 0.7255 - val_loss: 0.6562 Epoch 34/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 95ms/step - accuracy: 0.9955 - loss: 0.0144 - val_accuracy: 0.7237 - val_loss: 0.6557 Epoch 35/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 84ms/step - accuracy: 0.9947 - loss: 0.0169 - val_accuracy: 0.7195 - val_loss: 0.6553 Epoch 36/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 80ms/step - accuracy: 0.9951 - loss: 0.0136 - val_accuracy: 0.7219 - val_loss: 0.6512 Epoch 37/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 102ms/step - accuracy: 0.9969 - loss: 0.0092 - val_accuracy: 0.7267 - val_loss: 0.6486 Epoch 38/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 66ms/step - accuracy: 0.9936 - loss: 0.0184 - val_accuracy: 0.7304 - val_loss: 0.6452 Epoch 39/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 81ms/step - accuracy: 0.9959 - loss: 0.0122 - val_accuracy: 0.7322 - val_loss: 0.6451 Epoch 40/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 71ms/step - accuracy: 0.9969 - loss: 0.0119 - val_accuracy: 0.7316 - val_loss: 0.6480 Epoch 41/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 85ms/step - accuracy: 0.9960 - loss: 0.0127 - val_accuracy: 0.7231 - val_loss: 0.6518 Epoch 42/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 76ms/step - accuracy: 0.9935 - loss: 0.0179 - val_accuracy: 0.7255 - val_loss: 0.6529 Epoch 43/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 86ms/step - accuracy: 0.9942 - loss: 0.0140 - val_accuracy: 0.7225 - val_loss: 0.6546 Epoch 44/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 78ms/step - accuracy: 0.9957 - loss: 0.0143 - val_accuracy: 0.7261 - val_loss: 0.6540 Epoch 45/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 81ms/step - accuracy: 0.9954 - loss: 0.0122 - val_accuracy: 0.7316 - val_loss: 0.6538 Epoch 46/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 115ms/step - accuracy: 0.9954 - loss: 0.0130 - val_accuracy: 0.7322 - val_loss: 0.6573 Epoch 47/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 80ms/step - accuracy: 0.9934 - loss: 0.0158 - val_accuracy: 0.7267 - val_loss: 0.6658 Epoch 48/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 93ms/step - accuracy: 0.9957 - loss: 0.0105 - val_accuracy: 0.7291 - val_loss: 0.6714 Epoch 49/150 6/6 ━━━━━━━━━━━━━━━━━━━━ 1s 93ms/step - accuracy: 0.9949 - loss: 0.0136 - val_accuracy: 0.7279 - val_loss: 0.6789 Elapsed time for Neural Network Down Sampling is 32.92 seconds.
hist_df2 = pd.DataFrame(results_nn_down.history)
hist_df2.loc[:, ['loss','val_loss']].plot()
hist_df2.loc[:, ['accuracy','val_accuracy']].plot()
plt.show()
y_pred_train_nn_down = model2.predict(X_train_undersampled)
y_pred_train_nn_down = y_pred_train_nn_down.argmax(axis=1)
94/94 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step
y_pred_test_nn_down = model2.predict(X_test)
y_pred_test_nn_down = y_pred_test_nn_down.argmax(axis=1)
52/52 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step
cm_nn_down = confusion_matrix(y_test, y_pred_test_nn_down)
cm_nn_down
array([[380, 33, 83],
[ 41, 300, 70],
[128, 88, 531]])
ax = sns.heatmap(cm_nn_down, cmap='flare', annot=True, fmt='d', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel("Predicted Class", fontsize=12)
plt.ylabel("True Class",fontsize=12)
plt.title("Confusion Matrix", fontsize=12)
plt.figure(figsize=(5,3))
plt.show()
<Figure size 500x300 with 0 Axes>
print(classification_report(y_test, y_pred_test_nn_down))
precision recall f1-score support
0 0.69 0.77 0.73 496
1 0.71 0.73 0.72 411
2 0.78 0.71 0.74 747
accuracy 0.73 1654
macro avg 0.73 0.74 0.73 1654
weighted avg 0.74 0.73 0.73 1654
acc_train_nn_down = accuracy_score(y_train_undersampled, y_pred_train_nn_down)
acc_test_nn_down = accuracy_score(y_test, y_pred_test_nn_down)
prec_nn_down = precision_score(y_test, y_pred_test_nn_down, average='weighted')
recall_nn_down = recall_score(y_test, y_pred_test_nn_down, average='weighted')
f1_score_nn_down = f1_score(y_test, y_pred_test_nn_down, average='weighted')
print('Neural Network Under Sampling Accuracy Train percentage is:', round(acc_train_nn_down*100,2))
print('Neural Network Under Sampling Accuracyp Test Percentage is:', round(acc_test_nn_down*100,2))
print('Neural Network Under Sampling Precision percentage is:', round(prec_nn_down*100,2))
print('Neural Network Under Sampling Recall percentage is:', round(recall_nn_down*100,2))
print('Neural Network Under Sampling F1-score percentage is:', round(f1_score_nn_down*100,2))
Neural Network Under Sampling Accuracy Train percentage is: 99.46 Neural Network Under Sampling Accuracyp Test Percentage is: 73.22 Neural Network Under Sampling Precision percentage is: 73.52 Neural Network Under Sampling Recall percentage is: 73.22 Neural Network Under Sampling F1-score percentage is: 73.25
print("The time of execution of Recurrent Neural Network Model is :",
elapsed_time_nn_down , "ms")
The time of execution of Recurrent Neural Network Model is : 32.924465653999505 ms
pd.set_option('display.width',1000)
df2 = pd.DataFrame([
['Logistic Regression', round(acc_train_lr_down*100,2), round(acc_test_lr_down*100,2), round(prec_lr_down*100,2), round(recall_lr_down*100,2), round(f1_score_lr_down*100,2)],
['XGBoost', round(acc_train_xgb_down*100,2), round(acc_test_xgb_down*100,2), round(prec_xgb_down*100,2), round(recall_xgb_down*100,2), round(f1_score_xgb_down*100,2)],
['Multinomial Naive Bayes', round(acc_train_mnb_down*100,2), round(acc_test_mnb_down*100,2), round(prec_mnb_down*100,2), round(recall_mnb_down*100,2), round(f1_score_mnb_down*100,2)],
['Bernoulli Naive Bayes', round(acc_train_bnb_down*100,2), round(acc_test_bnb_down*100,2), round(prec_bnb_down*100,2), round(recall_bnb_down*100,2), round(f1_score_bnb_down*100,2)],
['Random Forest Classifier', round(acc_train_rfc_down*100,2), round(acc_test_rfc_down*100,2), round(prec_rfc_down*100,2), round(recall_rfc_down*100,2), round(f1_score_rfc_down*100,2)],
['Neural Network', round(acc_train_nn_down*100,2), round(acc_test_nn_down*100,2), round(prec_nn_down*100,2), round(recall_nn_down*100,2), round(f1_score_nn_down*100,2)]
],
columns = ['Model', 'Train Accuracy', 'Test Accuracy', 'Precision', 'Recall', 'f1-Score']
)
print("Performance metrics for Global warming climate change Tweets Down Sampling(%)")
print(df2)
Performance metrics for Global warming climate change Tweets Down Sampling(%)
Model Train Accuracy Test Accuracy Precision Recall f1-Score
0 Logistic Regression 98.09 73.46 74.57 73.46 73.61
1 XGBoost 91.93 70.37 72.98 70.37 70.64
2 Multinomial Naive Bayes 91.10 71.46 72.07 71.46 71.52
3 Bernoulli Naive Bayes 89.52 69.53 71.53 69.53 69.62
4 Random Forest Classifier 99.56 71.64 74.67 71.64 71.84
5 Neural Network 99.46 73.22 73.52 73.22 73.25
# plot data in stack manner of bar type
b = df2.plot(x='Model', kind='barh', stacked=True,figsize = (8,6), xlabel='Metrics Percentage', ylabel = 'Models',
title='Performance metrics Under Sampling(%)')
for c2 in b.containers:
#labels = [v.get_height() if v.get_height() > 0 else '' for v in c]
b.bar_label(c2, label_type = 'center')
plt.legend(loc='upper right', bbox_to_anchor=(0.6, 0., 0.6, 0.6))
plt.xticks(rotation=90)
plt.show()
metrics_down = [[round(acc_train_lr_down*100,2),round(acc_test_lr_down*100,2),round(prec_lr_down*100,2),round(recall_lr_down*100,2),round(f1_score_lr_down*100,2)],
[round(acc_train_xgb_down*100,2),round(acc_test_xgb_down*100,2),round(prec_xgb_down*100,2),round(recall_xgb_down*100,2),round(f1_score_xgb_down*100,2)],
[round(acc_train_mnb_down*100,2),round(acc_test_mnb_down*100,2),round(prec_mnb_down*100,2),round(recall_mnb_down*100,2),round(f1_score_mnb_down*100,2)],
[round(acc_train_bnb_down*100,2),round(acc_test_bnb_down*100,2),round(prec_bnb_down*100,2),round(recall_bnb_down*100,2),round(f1_score_bnb_down*100,2)],
[round(acc_train_rfc_down*100,2),round(acc_test_rfc_down*100,2),round(prec_rfc_down*100,2),round(recall_rfc_down*100,2),round(f1_score_rfc_down*100,2)],
[round(acc_train_nn_down*100,2),round(acc_test_nn_down*100,2),round(prec_nn_down*100,2),round(recall_nn_down*100,2),round(f1_score_nn_down*100,2)]
]
Final_result_down = pd.DataFrame(metrics_down, columns=['Train Accuracy', 'Test Accuracy', 'Test Precision', 'Test Recall', 'Test F1 Score'],
index=['Logistic Regression','XGBoost','Multinomial Naive Bayes', 'Bernoulli Naive Bayes', 'Random Forest Classifier', 'Neural Network'])
Final_result_down.style.background_gradient(cmap='prism', axis=1)
| Train Accuracy | Test Accuracy | Test Precision | Test Recall | Test F1 Score | |
|---|---|---|---|---|---|
| Logistic Regression | 98.090000 | 73.460000 | 74.570000 | 73.460000 | 73.610000 |
| XGBoost | 91.930000 | 70.370000 | 72.980000 | 70.370000 | 70.640000 |
| Multinomial Naive Bayes | 91.100000 | 71.460000 | 72.070000 | 71.460000 | 71.520000 |
| Bernoulli Naive Bayes | 89.520000 | 69.530000 | 71.530000 | 69.530000 | 69.620000 |
| Random Forest Classifier | 99.560000 | 71.640000 | 74.670000 | 71.640000 | 71.840000 |
| Neural Network | 99.460000 | 73.220000 | 73.520000 | 73.220000 | 73.250000 |
Final_result_down.plot(kind='bar')
plt.legend(loc = 'lower right')
<matplotlib.legend.Legend at 0x142644210>
from imblearn.over_sampling import SMOTE
!pip install -U threadpoolctl
Requirement already satisfied: threadpoolctl in /opt/anaconda3/lib/python3.12/site-packages (3.5.0) [notice] A new release of pip is available: 24.2 -> 24.3.1 [notice] To update, run: pip install --upgrade pip
upsampling = SMOTE(random_state=0)
X_train_upsampling, y_train_upsampling = upsampling.fit_resample(X_train, y_train)
sns.countplot(x = y_train_upsampling)
<Axes: xlabel='Sentiment_Encoded', ylabel='count'>
print('Original dataset shape %s' % Counter(y_train))
print('Resampled dataset shape %s' % Counter(y_train_upsampling))
Original dataset shape Counter({2: 1752, 0: 1111, 1: 996})
Resampled dataset shape Counter({1: 1752, 2: 1752, 0: 1752})
start_lr_upsample = time.time()
model1_lr_upsample = LogisticRegression(max_iter = 1000, multi_class = "multinomial")
model1_lr_upsample.fit(X_train_upsampling, y_train_upsampling)
LogisticRegression(max_iter=1000, multi_class='multinomial')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression(max_iter=1000, multi_class='multinomial')
y_pred_train_lr_upsample = model1_lr_upsample.predict(X_train_upsampling)
y_pred_test_lr_upsample = model1_lr_upsample.predict(X_test)
report1_lr_upsample = classification_report(y_test, y_pred_test_lr_upsample)
print(f"Classification Report: \n{report1_lr_upsample}")
Classification Report:
precision recall f1-score support
0 0.75 0.75 0.75 496
1 0.70 0.78 0.73 411
2 0.81 0.76 0.79 747
accuracy 0.76 1654
macro avg 0.75 0.76 0.76 1654
weighted avg 0.77 0.76 0.76 1654
acc_train_lr_up = accuracy_score(y_train_upsampling, y_pred_train_lr_upsample)
acc_test_lr_up = accuracy_score(y_test, y_pred_test_lr_upsample)
prec_lr_up = precision_score(y_test, y_pred_test_lr_upsample, average='weighted')
recall_lr_up = recall_score(y_test, y_pred_test_lr_upsample, average='weighted')
f1_score_lr_up = f1_score(y_test, y_pred_test_lr_upsample, average='weighted')
print('Logistic Regression UpSampling Accuracy Train percentage is:', round(acc_train_lr_up*100,2))
print('Logistic Regression UpSampling Accuracy percentage is:', round(acc_test_lr_up*100,2))
print('Logistic Regression UpSampling Precision percentage is:', round(prec_lr_up*100,2))
print('Logistic Regression UpSampling Recall percentage is:', round(recall_lr_up*100,2))
print('Logistic Regression UpSampling F1-score percentage is:', round(f1_score_lr_up*100,2))
Logistic Regression UpSampling Accuracy Train percentage is: 90.16 Logistic Regression UpSampling Accuracy percentage is: 76.18 Logistic Regression UpSampling Precision percentage is: 76.51 Logistic Regression UpSampling Recall percentage is: 76.18 Logistic Regression UpSampling F1-score percentage is: 76.25
# Display the confusion matrix
conf_matrix1_lr_up = confusion_matrix(y_test, y_pred_test_lr_upsample)
display(conf_matrix1_lr_up)
array([[372, 50, 74],
[ 33, 320, 58],
[ 89, 90, 568]])
plt.figure(figsize=(5, 3))
sns.heatmap(conf_matrix1_lr_up, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for Logistic Regression Upsampling')
plt.show()
end_lr_upsample = time.time()
Training_time_lr_upsample = end_lr_upsample - start_lr_upsample
y_proba_lr_UpScaled = model1_lr_upsample.predict_proba(X_test)
from sklearn.multiclass import OneVsRestClassifier
clf_lr_UpScaled = OneVsRestClassifier(LogisticRegression(max_iter = 1000, multi_class = "multinomial"))
clf_lr_UpScaled.fit(X_train_upsampling,y_train_upsampling)
OneVsRestClassifier(estimator=LogisticRegression(max_iter=1000,
multi_class='multinomial'))In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. OneVsRestClassifier(estimator=LogisticRegression(max_iter=1000,
multi_class='multinomial'))LogisticRegression(max_iter=1000, multi_class='multinomial')
LogisticRegression(max_iter=1000, multi_class='multinomial')
pred_lr_UpScaled = clf_lr_UpScaled.predict(X_test)
pred_prob_lr_UpScaled = clf_lr_UpScaled.predict_proba(X_test)
fpr_lr_up={}
tpr_lr_up={}
thresh_lr_up={}
n_classes_lr_up = 3
for i in range(n_classes_lr_up):
fpr_lr_up[i],tpr_lr_up[i],thresh_lr_up[i] = roc_curve(y_test, pred_prob_lr_UpScaled[:,i], pos_label=i)
#Plotting
plt.plot(fpr_lr_up[0],tpr_lr_up[0],linestyle='--',color='orange',label='Class 0 vs Rest')
plt.plot(fpr_lr_up[1],tpr_lr_up[1],linestyle='--',color='green',label='Class 1 vs Rest')
plt.plot(fpr_lr_up[2],tpr_lr_up[2],linestyle='--',color='blue',label='Class 2 vs Rest')
plt.title('Multiclass ROC curve for Logistic Regression')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='best')
<matplotlib.legend.Legend at 0x1426e4e90>
fpr1_lr_up, tpr1_lr_up, thresh1_lr_up = roc_curve(y_test, y_proba_lr_UpScaled[:,1], pos_label=1)
random_probs_lr_UpScaled = [0 for i in range(len(y_test))]
p_fpr_lr_UpScaled, p_tpr_lr_UpScaled, _ = roc_curve(y_test,random_probs_lr_UpScaled,pos_label=1)
auc_score1_lr_UpScaled = roc_auc_score(y_test, y_proba_lr_UpScaled, multi_class='ovr')
print(auc_score1_lr_UpScaled)
0.8963176339575553
plt.plot(fpr1_lr_up, tpr1_lr_up, linestyle='--',color='orange', label='Logistic Regression UpScaled AUC Score is : {:.4f}'.format(auc_score1_lr_UpScaled))
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True positive Rate')
plt.legend(loc='best')
plt.show()
print("The time of execution of Logistic Regression Upsampling Model is :",
Training_time_lr_upsample , "ms")
The time of execution of Logistic Regression Upsampling Model is : 12.018002986907959 ms
start_xg_up = time.time()
model2_xgb_up = XGBClassifier(random_state=42, n_estimators=100, max_depth=20, min_samples_split=2, min_samples_leaf=1)
model2_xgb_up = model2_xgb_up.fit(X_train_upsampling, y_train_upsampling)
model2_xgb_up
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=20, max_leaves=None,
min_child_weight=None, min_samples_leaf=1, min_samples_split=2,
missing=nan, monotone_constraints=None, multi_strategy=None,
n_estimators=100, n_jobs=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=20, max_leaves=None,
min_child_weight=None, min_samples_leaf=1, min_samples_split=2,
missing=nan, monotone_constraints=None, multi_strategy=None,
n_estimators=100, n_jobs=None, ...)y_pred_train_xgb_up = model2_xgb_up.predict(X_train_upsampling)
y_pred_test_xgb_up = model2_xgb_up.predict(X_test)
report2_xgb_up = classification_report(y_test, y_pred_test_xgb_up)
print(f"Classification Report: \n{report2_xgb_up}")
Classification Report:
precision recall f1-score support
0 0.75 0.70 0.72 496
1 0.61 0.79 0.69 411
2 0.78 0.69 0.73 747
accuracy 0.71 1654
macro avg 0.71 0.72 0.71 1654
weighted avg 0.73 0.71 0.72 1654
acc_train_xgb_up = accuracy_score(y_train_upsampling, y_pred_train_xgb_up)
acc_test_xgb_up = accuracy_score(y_test, y_pred_test_xgb_up)
prec_xgb_up = precision_score(y_test, y_pred_test_xgb_up, average='weighted')
recall_xgb_up = recall_score(y_test, y_pred_test_xgb_up, average='weighted')
f1_score_xgb_up = f1_score(y_test, y_pred_test_xgb_up, average='weighted')
print('XGBoost Classifier Upsampling Accuracy Train percentage is:', round(acc_train_xgb_up*100,2))
print('XGBoost Classifier Upsampling Accuracy percentage is:', round(acc_test_xgb_up*100,2))
print('XGBoost Classifier Upsampling Precision percentage is:', round(prec_xgb_up*100,2))
print('XGBoost Classifier Upsampling Recall percentage is:', round(recall_xgb_up*100,2))
print('XGBoost Classifier Upsampling F1-score percentage is:', round(f1_score_xgb_up*100,2))
XGBoost Classifier Upsampling Accuracy Train percentage is: 85.58 XGBoost Classifier Upsampling Accuracy percentage is: 71.4 XGBoost Classifier Upsampling Precision percentage is: 72.59 XGBoost Classifier Upsampling Recall percentage is: 71.4 XGBoost Classifier Upsampling F1-score percentage is: 71.56
# Display the confusion matrix
conf_matrix2_xgb_up = confusion_matrix(y_test, y_pred_test_xgb_up)
display(conf_matrix2_xgb_up)
array([[345, 63, 88],
[ 30, 323, 58],
[ 88, 146, 513]])
plt.figure(figsize=(5, 3))
sns.heatmap(conf_matrix2_xgb_up, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for XGBoost Classifier Upsampling')
plt.show()
end_xg_up = time.time()
Training_time_xg_up = end_xg_up - start_xg_up
print("The time of execution of XGBoost UpSampling Classifier Model is :",
Training_time_xg_up , "ms")
The time of execution of XGBoost UpSampling Classifier Model is : 18.946184158325195 ms
start_mnb_up = time.time()
params = {'alpha': [0.01,0.1,0.5,1,10],}
model3_mnb_up = GridSearchCV(MultinomialNB(), param_grid=params, cv=5, verbose=10)
model3_mnb_up = model3_mnb_up.fit(X_train_upsampling,y_train_upsampling)
Fitting 5 folds for each of 5 candidates, totalling 25 fits [CV 1/5; 1/5] START alpha=0.01.................................................. [CV 1/5; 1/5] END ...................alpha=0.01;, score=0.708 total time= 1.3s [CV 2/5; 1/5] START alpha=0.01.................................................. [CV 2/5; 1/5] END ...................alpha=0.01;, score=0.738 total time= 1.1s [CV 3/5; 1/5] START alpha=0.01.................................................. [CV 3/5; 1/5] END ...................alpha=0.01;, score=0.676 total time= 1.0s [CV 4/5; 1/5] START alpha=0.01.................................................. [CV 4/5; 1/5] END ...................alpha=0.01;, score=0.682 total time= 1.0s [CV 5/5; 1/5] START alpha=0.01.................................................. [CV 5/5; 1/5] END ...................alpha=0.01;, score=0.706 total time= 1.0s [CV 1/5; 2/5] START alpha=0.1................................................... [CV 1/5; 2/5] END ....................alpha=0.1;, score=0.706 total time= 1.0s [CV 2/5; 2/5] START alpha=0.1................................................... [CV 2/5; 2/5] END ....................alpha=0.1;, score=0.751 total time= 1.0s [CV 3/5; 2/5] START alpha=0.1................................................... [CV 3/5; 2/5] END ....................alpha=0.1;, score=0.673 total time= 1.0s [CV 4/5; 2/5] START alpha=0.1................................................... [CV 4/5; 2/5] END ....................alpha=0.1;, score=0.680 total time= 1.0s [CV 5/5; 2/5] START alpha=0.1................................................... [CV 5/5; 2/5] END ....................alpha=0.1;, score=0.703 total time= 1.0s [CV 1/5; 3/5] START alpha=0.5................................................... [CV 1/5; 3/5] END ....................alpha=0.5;, score=0.708 total time= 1.3s [CV 2/5; 3/5] START alpha=0.5................................................... [CV 2/5; 3/5] END ....................alpha=0.5;, score=0.754 total time= 1.1s [CV 3/5; 3/5] START alpha=0.5................................................... [CV 3/5; 3/5] END ....................alpha=0.5;, score=0.669 total time= 1.0s [CV 4/5; 3/5] START alpha=0.5................................................... [CV 4/5; 3/5] END ....................alpha=0.5;, score=0.674 total time= 1.0s [CV 5/5; 3/5] START alpha=0.5................................................... [CV 5/5; 3/5] END ....................alpha=0.5;, score=0.700 total time= 1.0s [CV 1/5; 4/5] START alpha=1..................................................... [CV 1/5; 4/5] END ......................alpha=1;, score=0.707 total time= 1.0s [CV 2/5; 4/5] START alpha=1..................................................... [CV 2/5; 4/5] END ......................alpha=1;, score=0.752 total time= 1.0s [CV 3/5; 4/5] START alpha=1..................................................... [CV 3/5; 4/5] END ......................alpha=1;, score=0.663 total time= 1.0s [CV 4/5; 4/5] START alpha=1..................................................... [CV 4/5; 4/5] END ......................alpha=1;, score=0.670 total time= 1.0s [CV 5/5; 4/5] START alpha=1..................................................... [CV 5/5; 4/5] END ......................alpha=1;, score=0.695 total time= 1.0s [CV 1/5; 5/5] START alpha=10.................................................... [CV 1/5; 5/5] END .....................alpha=10;, score=0.673 total time= 1.0s [CV 2/5; 5/5] START alpha=10.................................................... [CV 2/5; 5/5] END .....................alpha=10;, score=0.719 total time= 1.0s [CV 3/5; 5/5] START alpha=10.................................................... [CV 3/5; 5/5] END .....................alpha=10;, score=0.656 total time= 1.0s [CV 4/5; 5/5] START alpha=10.................................................... [CV 4/5; 5/5] END .....................alpha=10;, score=0.665 total time= 1.0s [CV 5/5; 5/5] START alpha=10.................................................... [CV 5/5; 5/5] END .....................alpha=10;, score=0.691 total time= 1.0s
y_pred_train_mnb_up = model3_mnb_up.predict(X_train_upsampling)
y_pred_test_mnb_up = model3_mnb_up.predict(X_test)
report3_mnb_up = classification_report(y_test, y_pred_test_xgb_up)
print(f"Classification Report: \n{report3_mnb_up}")
Classification Report:
precision recall f1-score support
0 0.75 0.70 0.72 496
1 0.61 0.79 0.69 411
2 0.78 0.69 0.73 747
accuracy 0.71 1654
macro avg 0.71 0.72 0.71 1654
weighted avg 0.73 0.71 0.72 1654
print('Train Accuracy : %.3f'%model3_mnb_up.best_estimator_.score(X_train_upsampling, y_train_upsampling))
print('Test Accuracy : %.3f'%model3_mnb_up.best_estimator_.score(X_test, y_test))
print('Best Accuracy Through Grid Search : %.3f'%model3_mnb_up.best_score_)
print('Best Parameters : ',model3_mnb_up.best_params_)
Train Accuracy : 0.844
Test Accuracy : 0.748
Best Accuracy Through Grid Search : 0.703
Best Parameters : {'alpha': 0.1}
acc_train_mnb_up = accuracy_score(y_train_upsampling, y_pred_train_mnb_up)
acc_test_mnb_up = accuracy_score(y_test, y_pred_test_mnb_up)
prec_mnb_up = precision_score(y_test, y_pred_test_mnb_up, average='weighted')
recall_mnb_up = recall_score(y_test, y_pred_test_mnb_up, average='weighted')
f1_score_mnb_up = f1_score(y_test, y_pred_test_mnb_up, average='weighted')
print('Multinomial Naive Bayes UpSampling Accuracy Train percentage is:', round(acc_train_mnb_up*100,2))
print('Multinomial Naive Bayes UpSampling Accuracy percentage is:', round(acc_test_mnb_up*100,2))
print('Multinomial Naive Bayes UpSampling Precision percentage is:', round(prec_mnb_up*100,2))
print('Multinomial Naive Bayes UpSampling Recall percentage is:', round(recall_mnb_up*100,2))
print('Multinomial Naive Bayes UpSampling F1-score percentage is:', round(f1_score_mnb_up*100,2))
Multinomial Naive Bayes UpSampling Accuracy Train percentage is: 84.4 Multinomial Naive Bayes UpSampling Accuracy percentage is: 74.79 Multinomial Naive Bayes UpSampling Precision percentage is: 74.81 Multinomial Naive Bayes UpSampling Recall percentage is: 74.79 Multinomial Naive Bayes UpSampling F1-score percentage is: 74.77
# Display the confusion matrix
conf_matrix3_mnb_up = confusion_matrix(y_test, y_pred_test_mnb_up)
display(conf_matrix3_mnb_up)
array([[364, 37, 95],
[ 37, 290, 84],
[107, 57, 583]])
plt.figure(figsize=(5, 3))
sns.heatmap(conf_matrix3_mnb_up, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for Multinomial Naive Bayes UpSampling')
plt.show()
end_mnb_up = time.time()
Training_time_mnb_up = end_mnb_up - start_mnb_up
print("The time of execution of Multinomial Naive Bayes UpSampling Model is :",
Training_time_mnb_up , "ms")
The time of execution of Multinomial Naive Bayes UpSampling Model is : 28.228438138961792 ms
## Bernoulli NAIVE BAYES OVERSAMPLING
start_bnb_up = time.time()
model4_bnb_up = BernoulliNB()
model4_bnb_up = model4_bnb_up.fit(X_train_upsampling,y_train_upsampling)
model4_bnb_up
BernoulliNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
BernoulliNB()
y_pred_train_bnb_up = model4_bnb_up.predict(X_train_upsampling)
y_pred_test_bnb_up = model4_bnb_up.predict(X_test)
report4_bnb_up = classification_report(y_test, y_pred_test_bnb_up)
print(f"Classification Report: \n{report4_bnb_up}")
Classification Report:
precision recall f1-score support
0 0.69 0.75 0.72 496
1 0.59 0.81 0.69 411
2 0.84 0.62 0.71 747
accuracy 0.71 1654
macro avg 0.71 0.73 0.70 1654
weighted avg 0.73 0.71 0.71 1654
acc_train_bnb_up = accuracy_score(y_train_upsampling, y_pred_train_bnb_up)
acc_test_bnb_up = accuracy_score(y_test, y_pred_test_bnb_up)
prec_bnb_up = precision_score(y_test, y_pred_test_bnb_up, average='weighted')
recall_bnb_up = recall_score(y_test, y_pred_test_bnb_up, average='weighted')
f1_score_bnb_up = f1_score(y_test, y_pred_test_bnb_up, average='weighted')
print('Bernoulli Naive Bayes UpSampling Accuracy Train percentage is:', round(acc_train_bnb_up*100,2))
print('Bernoulli Naive Bayes UpSampling Accuracy percentage is:', round(acc_test_bnb_up*100,2))
print('Bernoulli Naive Bayes UpSampling Precision percentage is:', round(prec_bnb_up*100,2))
print('Bernoulli Naive Bayesn UpSampling Recall percentage is:', round(recall_bnb_up*100,2))
print('Bernoulli Naive Bayes UpSampling F1-score percentage is:', round(f1_score_bnb_up*100,2))
Bernoulli Naive Bayes UpSampling Accuracy Train percentage is: 80.27 Bernoulli Naive Bayes UpSampling Accuracy percentage is: 70.56 Bernoulli Naive Bayes UpSampling Precision percentage is: 73.2 Bernoulli Naive Bayesn UpSampling Recall percentage is: 70.56 Bernoulli Naive Bayes UpSampling F1-score percentage is: 70.66
# Display the confusion matrix
conf_matrix4_bnb_up = confusion_matrix(y_test, y_pred_test_bnb_up)
display(conf_matrix4_bnb_up)
array([[373, 66, 57],
[ 46, 333, 32],
[124, 162, 461]])
plt.figure(figsize=(5, 3))
sns.heatmap(conf_matrix4_bnb_up, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for Bernoulli Naive Bayes')
plt.show()
end_bnb_up = time.time()
Training_time_bnb_up = end_bnb_up - start_bnb_up
print("The time of execution of Bernoulli Naive Bayes UpSampling Model is :",
Training_time_bnb_down , "ms")
The time of execution of Bernoulli Naive Bayes UpSampling Model is : 1.8181569576263428 ms
start_rfc_up = time.time()
model5_rfc_up = RandomForestClassifier(n_estimators=100, random_state=0)
model5_rfc_up = model5_rfc_up.fit(X_train_upsampling,y_train_upsampling)
model5_rfc_up
RandomForestClassifier(random_state=0)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(random_state=0)
y_pred_train_rfc_up = model5_rfc_up.predict(X_train_upsampling)
y_pred_test_rfc_up = model5_rfc_up.predict(X_test)
report5_rfc_up = classification_report(y_test, y_pred_test_rfc_up)
print(f"Classification Report: \n{report5_rfc_up}")
Classification Report:
precision recall f1-score support
0 0.74 0.75 0.75 496
1 0.66 0.80 0.73 411
2 0.81 0.71 0.76 747
accuracy 0.75 1654
macro avg 0.74 0.76 0.74 1654
weighted avg 0.75 0.75 0.75 1654
acc_train_rfc_up = accuracy_score(y_train_upsampling, y_pred_train_rfc_up)
acc_test_rfc_up = accuracy_score(y_test, y_pred_test_rfc_up)
prec_rfc_up = precision_score(y_test, y_pred_test_rfc_up, average='weighted')
recall_rfc_up = recall_score(y_test, y_pred_test_rfc_up, average='weighted')
f1_score_rfc_up = f1_score(y_test, y_pred_test_rfc_up, average='weighted')
print('Random Forest Classifier UpSampling Accuracy Train percentage is:', round(acc_train_rfc_up*100,2))
print('Random Forest Classifier UpSampling Accuracy percentage is:', round(acc_test_rfc_up*100,2))
print('Random Forest Classifier UpSampling Precision percentage is:', round(prec_rfc_up*100,2))
print('Random Forest Classifier UpSampling Recall percentage is:', round(recall_rfc_up*100,2))
print('Random Forest Classifier UpSampling F1-score percentage is:', round(f1_score_rfc_up*100,2))
Random Forest Classifier UpSampling Accuracy Train percentage is: 92.45 Random Forest Classifier UpSampling Accuracy percentage is: 74.67 Random Forest Classifier UpSampling Precision percentage is: 75.44 Random Forest Classifier UpSampling Recall percentage is: 74.67 Random Forest Classifier UpSampling F1-score percentage is: 74.75
# Display the confusion matrix
conf_matrix5_gnb_up = confusion_matrix(y_test, y_pred_test_rfc_up)
display(conf_matrix5_gnb_up)
array([[373, 44, 79],
[ 37, 329, 45],
[ 92, 122, 533]])
plt.figure(figsize=(5, 3))
sns.heatmap(conf_matrix5_gnb_up, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for Random Forest Classifier')
plt.show()
end_rfc_up = time.time()
Training_time_rfc_up = end_rfc_up - start_rfc_up
print("The time of execution of Random Forest Classifier UpSampling Model is :",
Training_time_rfc_up , "ms")
The time of execution of Random Forest Classifier UpSampling Model is : 56.391887187957764 ms
y_proba_rfc_UpScaled = model5_rfc_up.predict_proba(X_test)
clf_rfc_UpScaled = OneVsRestClassifier(RandomForestClassifier(n_estimators=100, random_state=0))
clf_rfc_UpScaled.fit(X_train_upsampling,y_train_upsampling)
OneVsRestClassifier(estimator=RandomForestClassifier(random_state=0))In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
OneVsRestClassifier(estimator=RandomForestClassifier(random_state=0))
RandomForestClassifier(random_state=0)
RandomForestClassifier(random_state=0)
pred_rfc_UpScaled = clf_rfc_UpScaled.predict(X_test)
pred_prob_rfc_UpScaled = clf_rfc_UpScaled.predict_proba(X_test)
fpr_rfc_up={}
tpr_rfc_up={}
thresh_rfc_up={}
n_classes_rfc_up = 3
for i in range(n_classes_rfc_up):
fpr_rfc_up[i],tpr_rfc_up[i],thresh_rfc_up[i] = roc_curve(y_test, pred_prob_rfc_UpScaled[:,i], pos_label=i)
#Plotting
plt.plot(fpr_rfc_up[0],tpr_rfc_up[0],linestyle='--',color='orange',label='Class 0 vs Rest')
plt.plot(fpr_rfc_up[1],tpr_rfc_up[1],linestyle='--',color='green',label='Class 1 vs Rest')
plt.plot(fpr_rfc_up[2],tpr_rfc_up[2],linestyle='--',color='blue',label='Class 2 vs Rest')
plt.title('Multiclass ROC curve for Random Forest UpScaling Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='best')
<matplotlib.legend.Legend at 0x140814e90>
fpr1_rfc_up, tpr1_rfc_up, thresh1_rfc_up = roc_curve(y_test, y_proba_rfc_UpScaled[:,1], pos_label=1)
random_probs_rfc_up = [0 for i in range(len(y_test))]
p_fpr_rfc_up, p_tpr_rfc_up, _ = roc_curve(y_test,random_probs_rfc_up,pos_label=1)
auc_score_rfc_up = roc_auc_score(y_test, y_proba_rfc_UpScaled, multi_class='ovr')
print(auc_score_rfc_up)
0.8892071938469984
plt.plot(fpr1_rfc_up, tpr1_rfc_up, linestyle='--',color='orange', label='Random Forest Classifier UpSampling AUC Score is {:.4f}'.format(auc_score_rfc_up))
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True positive Rate')
plt.legend(loc='best')
plt.show()
model3 = models.Sequential()
# Input - Layer
model3.add(layers.Dense(100, activation = "relu", input_shape=(8000, )))
model3.add(tf.keras.layers.Flatten()) #Flattening the layer
model3.add(tf.keras.layers.BatchNormalization())
# Hidden - Layers
model3.add(tf.keras.layers.Dense(100, activation='relu'))
model3.add(Dropout(0.5)) #Dropout of 50% in each hidden layer
model3.add(tf.keras.layers.Dense(100, activation='relu'))
model3.add(Dropout(0.5)) #Dropout of 50% in each hidden layer
# Output- Layer
model3.add(layers.Dense(3, activation = "softmax"))
model3.summary()
Model: "sequential_2"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓ ┃ Layer (type) ┃ Output Shape ┃ Param # ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩ │ dense_8 (Dense) │ (None, 100) │ 800,100 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ flatten_2 (Flatten) │ (None, 100) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ batch_normalization_2 │ (None, 100) │ 400 │ │ (BatchNormalization) │ │ │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_9 (Dense) │ (None, 100) │ 10,100 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dropout_4 (Dropout) │ (None, 100) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_10 (Dense) │ (None, 100) │ 10,100 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dropout_5 (Dropout) │ (None, 100) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_11 (Dense) │ (None, 3) │ 303 │ └─────────────────────────────────┴────────────────────────┴───────────────┘
Total params: 821,003 (3.13 MB)
Trainable params: 820,803 (3.13 MB)
Non-trainable params: 200 (800.00 B)
optimize_up = keras.optimizers.Adam(learning_rate=0.001)
model3.compile(optimizer=optimize_up, loss=tf.losses.SparseCategoricalCrossentropy(from_logits=False), metrics=['accuracy'])
from keras import callbacks
earlystopping = callbacks.EarlyStopping(monitor="val_loss",
mode="min",
patience=10,
restore_best_weights=True)
start_over = time.perf_counter()
results_nn_up = model3.fit(
X_train_upsampling,y_train_upsampling,
epochs= 150,
batch_size = 512,
validation_data = (X_test, y_test),
verbose=True,
callbacks=[earlystopping]
)
elapsed_time_nn_over = time.perf_counter() - start_over
print("Elapsed time for Neural Network over sampling is %.2f seconds." %elapsed_time_nn_over)
Epoch 1/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 2s 83ms/step - accuracy: 0.3563 - loss: 1.2039 - val_accuracy: 0.3821 - val_loss: 1.0937 Epoch 2/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 65ms/step - accuracy: 0.5005 - loss: 0.9778 - val_accuracy: 0.4105 - val_loss: 1.0898 Epoch 3/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 63ms/step - accuracy: 0.6009 - loss: 0.8500 - val_accuracy: 0.5097 - val_loss: 1.0764 Epoch 4/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 65ms/step - accuracy: 0.6874 - loss: 0.7047 - val_accuracy: 0.6270 - val_loss: 1.0486 Epoch 5/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 64ms/step - accuracy: 0.7705 - loss: 0.5630 - val_accuracy: 0.6312 - val_loss: 1.0055 Epoch 6/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 65ms/step - accuracy: 0.8269 - loss: 0.4145 - val_accuracy: 0.5919 - val_loss: 0.9508 Epoch 7/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 65ms/step - accuracy: 0.8678 - loss: 0.3083 - val_accuracy: 0.6058 - val_loss: 0.9035 Epoch 8/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 66ms/step - accuracy: 0.8862 - loss: 0.2602 - val_accuracy: 0.6197 - val_loss: 0.8664 Epoch 9/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 68ms/step - accuracy: 0.9006 - loss: 0.2147 - val_accuracy: 0.6372 - val_loss: 0.8365 Epoch 10/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 76ms/step - accuracy: 0.9034 - loss: 0.2002 - val_accuracy: 0.6318 - val_loss: 0.8205 Epoch 11/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 81ms/step - accuracy: 0.9113 - loss: 0.1776 - val_accuracy: 0.6608 - val_loss: 0.7958 Epoch 12/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 85ms/step - accuracy: 0.9132 - loss: 0.1707 - val_accuracy: 0.6856 - val_loss: 0.7723 Epoch 13/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 88ms/step - accuracy: 0.9233 - loss: 0.1565 - val_accuracy: 0.6953 - val_loss: 0.7538 Epoch 14/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 61ms/step - accuracy: 0.9127 - loss: 0.1649 - val_accuracy: 0.7074 - val_loss: 0.7337 Epoch 15/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 86ms/step - accuracy: 0.9174 - loss: 0.1508 - val_accuracy: 0.7092 - val_loss: 0.7268 Epoch 16/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 100ms/step - accuracy: 0.9188 - loss: 0.1439 - val_accuracy: 0.7310 - val_loss: 0.7078 Epoch 17/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 86ms/step - accuracy: 0.9234 - loss: 0.1512 - val_accuracy: 0.7297 - val_loss: 0.6961 Epoch 18/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 85ms/step - accuracy: 0.9120 - loss: 0.1490 - val_accuracy: 0.7406 - val_loss: 0.6806 Epoch 19/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 97ms/step - accuracy: 0.9197 - loss: 0.1396 - val_accuracy: 0.7509 - val_loss: 0.6669 Epoch 20/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 106ms/step - accuracy: 0.9208 - loss: 0.1431 - val_accuracy: 0.7503 - val_loss: 0.6612 Epoch 21/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 114ms/step - accuracy: 0.9185 - loss: 0.1391 - val_accuracy: 0.7539 - val_loss: 0.6550 Epoch 22/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 109ms/step - accuracy: 0.9179 - loss: 0.1467 - val_accuracy: 0.7563 - val_loss: 0.6536 Epoch 23/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 114ms/step - accuracy: 0.9157 - loss: 0.1509 - val_accuracy: 0.7527 - val_loss: 0.6506 Epoch 24/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 98ms/step - accuracy: 0.9148 - loss: 0.1458 - val_accuracy: 0.7648 - val_loss: 0.6459 Epoch 25/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 90ms/step - accuracy: 0.9259 - loss: 0.1315 - val_accuracy: 0.7624 - val_loss: 0.6549 Epoch 26/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 90ms/step - accuracy: 0.9141 - loss: 0.1427 - val_accuracy: 0.7606 - val_loss: 0.6630 Epoch 27/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 105ms/step - accuracy: 0.9222 - loss: 0.1377 - val_accuracy: 0.7624 - val_loss: 0.6608 Epoch 28/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 95ms/step - accuracy: 0.9164 - loss: 0.1447 - val_accuracy: 0.7672 - val_loss: 0.6706 Epoch 29/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 94ms/step - accuracy: 0.9210 - loss: 0.1399 - val_accuracy: 0.7678 - val_loss: 0.6833 Epoch 30/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 83ms/step - accuracy: 0.9259 - loss: 0.1320 - val_accuracy: 0.7630 - val_loss: 0.6996 Epoch 31/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 87ms/step - accuracy: 0.9261 - loss: 0.1306 - val_accuracy: 0.7636 - val_loss: 0.7157 Epoch 32/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 83ms/step - accuracy: 0.9142 - loss: 0.1411 - val_accuracy: 0.7630 - val_loss: 0.7333 Epoch 33/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 77ms/step - accuracy: 0.9241 - loss: 0.1276 - val_accuracy: 0.7624 - val_loss: 0.7568 Epoch 34/150 11/11 ━━━━━━━━━━━━━━━━━━━━ 1s 87ms/step - accuracy: 0.9156 - loss: 0.1425 - val_accuracy: 0.7648 - val_loss: 0.7773 Elapsed time for Neural Network over sampling is 36.81 seconds.
hist_df3 = pd.DataFrame(results_nn_up.history)
hist_df3.loc[:, ['loss','val_loss']].plot()
hist_df3.loc[:, ['accuracy','val_accuracy']].plot()
plt.show()
y_pred_train_nn_up = model3.predict(X_train_upsampling)
y_pred_train_nn_up = y_pred_train_nn_up.argmax(axis=1)
165/165 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step
y_pred_test_nn_up = model3.predict(X_test)
y_pred_test_nn_up = y_pred_test_nn_up.argmax(axis=1)
52/52 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step
cm_nn_up = confusion_matrix(y_test, y_pred_test_nn_up)
cm_nn_up
array([[336, 24, 136],
[ 13, 260, 138],
[ 44, 34, 669]])
ax = sns.heatmap(cm_nn_up, annot=True, fmt='d', cmap='flare', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel("Predicted Class", fontsize=12)
plt.ylabel("True Class",fontsize=12)
plt.title("Confusion Matrix", fontsize=12)
plt.show()
acc_train_nn_up = accuracy_score(y_train_upsampling, y_pred_train_nn_up)
acc_test_nn_up = accuracy_score(y_test, y_pred_test_nn_up)
prec_nn_up = precision_score(y_test, y_pred_test_nn_up, average='weighted')
recall_nn_up = recall_score(y_test, y_pred_test_nn_up, average='weighted')
f1_score_nn_up = f1_score(y_test, y_pred_test_nn_up, average='weighted')
print('Neural Network Over Sampling Accuracy Train percentage is:', round(acc_train_nn_up*100,2))
print('Neural Network Over Sampling Accuracy Test Percentage is:', round(acc_test_nn_up*100,2))
print('Neural Network Over Sampling Precision percentage is:', round(prec_nn_up*100,2))
print('Neural Network Over Sampling Recall percentage is:', round(recall_nn_up*100,2))
print('Neural Network Over Sampling F1-score percentage is:', round(f1_score_nn_up*100,2))
Neural Network Over Sampling Accuracy Train percentage is: 83.81 Neural Network Over Sampling Accuracy Test Percentage is: 76.48 Neural Network Over Sampling Precision percentage is: 78.0 Neural Network Over Sampling Recall percentage is: 76.48 Neural Network Over Sampling F1-score percentage is: 76.15
print("The time of execution of Recurrent Neural Network Model is :",
elapsed_time_nn_over , "ms")
The time of execution of Recurrent Neural Network Model is : 36.80833590600014 ms
pd.set_option('display.width',1000)
df3 = pd.DataFrame([
['Logistic Regression', round(acc_train_lr_up*100,2), round(acc_test_lr_up*100,2), round(prec_lr_up*100,2), round(recall_lr_up*100,2), round(f1_score_lr_up*100,2)],
['XGBoost', round(acc_train_xgb_up*100,2), round(acc_test_xgb_up*100,2), round(prec_xgb_up*100,2), round(recall_xgb_up*100,2), round(f1_score_xgb_up*100,2)],
['Multinomial Naive Bayes', round(acc_train_mnb_up*100,2), round(acc_test_mnb_up*100,2), round(prec_mnb_up*100,2), round(recall_mnb_up*100,2), round(f1_score_mnb_up*100,2)],
['Bernoulli Naive Bayes', round(acc_train_bnb_up*100,2), round(acc_test_bnb_up*100,2), round(prec_bnb_up*100,2), round(recall_bnb_up*100,2), round(f1_score_bnb_up*100,2)],
['Random Forest Classifier', round(acc_train_rfc_up*100,2), round(acc_test_rfc_up*100,2), round(prec_rfc_up*100,2), round(recall_rfc_up*100,2), round(f1_score_rfc_up*100,2)],
['Neural Network', round(acc_train_nn_up*100,2), round(acc_test_nn_up*100,2), round(prec_nn_up*100,2), round(recall_nn_up*100,2), round(f1_score_nn_up*100,2)]
],
columns = ['Model', 'Train Accuracy', 'Test Accuracy', 'Precision', 'Recall', 'f1-Score']
)
print("Performance metrics for Global warming climate change Tweets Over Sampling SMOTE Technique(%)")
print(df3)
Performance metrics for Global warming climate change Tweets Over Sampling SMOTE Technique(%)
Model Train Accuracy Test Accuracy Precision Recall f1-Score
0 Logistic Regression 90.16 76.18 76.51 76.18 76.25
1 XGBoost 85.58 71.40 72.59 71.40 71.56
2 Multinomial Naive Bayes 84.40 74.79 74.81 74.79 74.77
3 Bernoulli Naive Bayes 80.27 70.56 73.20 70.56 70.66
4 Random Forest Classifier 92.45 74.67 75.44 74.67 74.75
5 Neural Network 83.81 76.48 78.00 76.48 76.15
# plot data in stack manner of bar type
c = df3.plot(x='Model', kind='barh', stacked=True,figsize = (8,6), xlabel='Metrics Percentage', ylabel = 'Models',
title='Performance Metrics Over Sampling(%)')
for c3 in c.containers:
#labels = [v.get_height() if v.get_height() > 0 else '' for v in c]
c.bar_label(c3, label_type = 'center')
plt.legend(loc='upper right', bbox_to_anchor=(0.6, 0., 0.6, 0.6))
plt.xticks(rotation=90)
plt.show()
metrics_up = [[round(acc_train_lr_up*100,2),round(acc_test_lr_up*100,2),round(prec_lr_up*100,2),round(recall_lr_up*100,2),round(f1_score_lr_up*100,2)],
[round(acc_train_xgb_up*100,2),round(acc_test_xgb_up*100,2),round(prec_xgb_up*100,2),round(recall_xgb_up*100,2),round(f1_score_xgb_up*100,2)],
[round(acc_train_mnb_up*100,2),round(acc_test_mnb_up*100,2),round(prec_mnb_up*100,2),round(recall_mnb_up*100,2),round(f1_score_mnb_up*100,2)],
[round(acc_train_bnb_up*100,2),round(acc_test_bnb_up*100,2),round(prec_bnb_up*100,2),round(recall_bnb_up*100,2),round(f1_score_bnb_up*100,2)],
[round(acc_train_rfc_up*100,2),round(acc_test_rfc_up*100,2),round(prec_rfc_up*100,2),round(recall_rfc_up*100,2),round(f1_score_rfc_up*100,2)],
[round(acc_train_nn_up*100,2),round(acc_test_nn_up*100,2),round(prec_nn_up*100,2),round(recall_nn_up*100,2),round(f1_score_nn_up*100,2)]
]
Final_result_up = pd.DataFrame(metrics_up, columns=['Train Accuracy', 'Test Accuracy', 'Test Precision', 'Test Recall', 'Test F1 Score'],
index=['Logistic Regression','XGBoost','Multinomial Naive Bayes', 'Bernoulli Naive Bayes', 'Random Forest Classifier', 'Neural Network'])
Final_result_up.style.background_gradient(cmap = 'prism', axis=1)
| Train Accuracy | Test Accuracy | Test Precision | Test Recall | Test F1 Score | |
|---|---|---|---|---|---|
| Logistic Regression | 90.160000 | 76.180000 | 76.510000 | 76.180000 | 76.250000 |
| XGBoost | 85.580000 | 71.400000 | 72.590000 | 71.400000 | 71.560000 |
| Multinomial Naive Bayes | 84.400000 | 74.790000 | 74.810000 | 74.790000 | 74.770000 |
| Bernoulli Naive Bayes | 80.270000 | 70.560000 | 73.200000 | 70.560000 | 70.660000 |
| Random Forest Classifier | 92.450000 | 74.670000 | 75.440000 | 74.670000 | 74.750000 |
| Neural Network | 83.810000 | 76.480000 | 78.000000 | 76.480000 | 76.150000 |
Final_result_up.plot(kind='bar')
plt.legend(loc = 'lower right')
<matplotlib.legend.Legend at 0x1435c5550>
methods = ['Logistic Regression', 'Multinomial Naive Bayes', 'Bernoulli Naive Bayes', 'XGBoost', 'Random Forest', 'Neural Network']
accuracy_scores = [round(acc_test_lr*100,2), round(acc_test_mnb*100,2), round(acc_test_bnb*100,2), round(acc_test_xgb*100,2), round(acc_test_rfc*100,2), round(acc_test_nn*100,2)]
precision_scores = [round(prec_lr*100,2), round(prec_mnb*100,2), round(prec_bnb*100,2), round(prec_xgb*100,2), round(prec_rfc*100,2), round(prec_nn*100,2)]
recall_scores = [round(recall_lr*100,2), round(recall_mnb*100,2), round(recall_bnb*100,2), round(recall_xgb*100,2), round(recall_rfc*100,2), round(recall_nn*100,2)]
f1_scores = [round(f1_score_lr*100,2), round(f1_score_mnb*100,2), round(f1_score_bnb*100,2), round(f1_score_xgb*100,2), round(f1_score_rfc*100,2), round(f1_score_nn*100,2)]
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.bar(methods, accuracy_scores, color=['orchid', 'green', 'orange', 'lightpink', 'olive', 'lawngreen'])
plt.xticks(rotation = 45)
plt.ylabel('Accuracy')
plt.title('Accuracy Comparison')
plt.subplot(1, 2, 2)
plt.bar(methods, precision_scores, color=['orchid', 'green', 'orange', 'lightpink', 'olive', 'lawngreen'])
plt.xticks(rotation = 45)
plt.ylabel('Precision Score')
plt.title('Precision Comparison')
plt.figure(figsize=(10, 4))
plt.subplot(2, 2, 1)
plt.bar(methods, recall_scores, color=['orchid', 'green', 'orange', 'lightpink', 'olive', 'lawngreen'])
plt.xticks(rotation = 45)
plt.ylabel('Recall Score')
plt.title('Recall Comparison')
plt.subplot(2, 2, 2)
plt.bar(methods, f1_scores, color=['orchid', 'green', 'orange', 'lightpink', 'olive', 'lawngreen'])
plt.xticks(rotation = 45)
plt.ylabel('F1 Score')
plt.title('F1 Score Comparison')
plt.show()
methods_down = ['Logistic Regression', 'Multinomial Naive Bayes', 'Bernoulli Naive Bayes', 'XGBoost', 'Random Forest', 'Neural Network']
accuracy_scores = [round(acc_test_lr_down*100,2), round(acc_test_mnb_down*100,2), round(acc_test_bnb_down*100,2), round(acc_test_xgb_down*100,2), round(acc_test_rfc_down*100,2), round(acc_test_nn_down*100,2)]
precision_scores = [round(prec_lr_down*100,2), round(prec_mnb_down*100,2), round(prec_bnb_down*100,2), round(prec_xgb_down*100,2), round(prec_rfc_down*100,2), round(prec_nn_down*100,2)]
recall_scores = [round(recall_lr_down*100,2), round(recall_mnb_down*100,2), round(recall_bnb_down*100,2), round(recall_xgb_down*100,2), round(recall_rfc_down*100,2), round(recall_nn_down*100,2)]
f1_scores = [round(f1_score_lr_down*100,2), round(f1_score_mnb_down*100,2), round(f1_score_bnb_down*100,2), round(f1_score_xgb_down*100,2), round(f1_score_rfc_down*100,2), round(f1_score_nn_down*100,2)]
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.bar(methods_down, accuracy_scores, color=['darksalmon', 'mistyrose', 'teal', 'chartreuse', 'khaki', 'fuchsia'])
plt.xticks(rotation = 45)
plt.ylabel('Accuracy')
plt.title('Accuracy Comparison')
plt.subplot(1, 2, 2)
plt.bar(methods_down, precision_scores, color=['darksalmon', 'mistyrose', 'teal', 'chartreuse', 'khaki', 'fuchsia'])
plt.xticks(rotation = 45)
plt.ylabel('Precision Score')
plt.title('Precision Comparison')
plt.figure(figsize=(10, 4))
plt.subplot(2, 2, 1)
plt.bar(methods_down, recall_scores, color=['darksalmon', 'mistyrose', 'teal', 'chartreuse', 'khaki', 'fuchsia'])
plt.xticks(rotation = 45)
plt.ylabel('Recall Score')
plt.title('Recall Comparison')
plt.subplot(2, 2, 2)
plt.bar(methods_down, f1_scores, color=['darksalmon', 'mistyrose', 'teal', 'chartreuse', 'khaki', 'fuchsia'])
plt.xticks(rotation = 45)
plt.ylabel('F1 Score')
plt.title('F1 Score Comparison')
plt.show()
methods_up = ['Logistic Regression', 'Multinomial Naive Bayes', 'Bernoulli Naive Bayes', 'XGBoost', 'Random Forest', 'Neural Network']
accuracy_scores = [round(acc_test_lr_up*100,2), round(acc_test_mnb_up*100,2), round(acc_test_bnb_up*100,2), round(acc_test_xgb_up*100,2), round(acc_test_rfc_up*100,2), round(acc_test_nn_up*100,2)]
precision_scores = [round(prec_lr_up*100,2), round(prec_mnb_up*100,2), round(prec_bnb_up*100,2), round(prec_xgb_up*100,2), round(prec_rfc_up*100,2), round(prec_nn_up*100,2)]
recall_scores = [round(recall_lr_up*100,2), round(recall_mnb_up*100,2), round(recall_bnb_up*100,2), round(recall_xgb_up*100,2), round(recall_rfc_up*100,2), round(recall_nn_up*100,2)]
f1_scores = [round(f1_score_lr_up*100,2), round(f1_score_mnb_up*100,2), round(f1_score_bnb_up*100,2), round(f1_score_xgb_up*100,2), round(f1_score_rfc_up*100,2), round(f1_score_nn_up*100,2)]
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.bar(methods_up, accuracy_scores, color=['orchid', 'green', 'orange', 'lightpink', 'olive', 'aqua'])
plt.xticks(rotation = 45)
plt.ylabel('Accuracy')
plt.title('Accuracy Comparison')
plt.subplot(1, 2, 2)
plt.bar(methods_up, precision_scores, color=['orchid', 'green', 'orange', 'lightpink', 'olive', 'aqua'])
plt.xticks(rotation = 45)
plt.ylabel('Precision Score')
plt.title('Precision Comparison')
plt.figure(figsize=(10, 4))
plt.subplot(2, 2, 1)
plt.bar(methods_up, recall_scores, color=['orchid', 'green', 'orange', 'lightpink', 'olive', 'aqua'])
plt.xticks(rotation = 45)
plt.ylabel('Recall Score')
plt.title('Recall Comparison')
plt.subplot(2, 2, 2)
plt.bar(methods_up, f1_scores, color=['orchid', 'green', 'orange', 'lightpink', 'olive', 'aqua'])
plt.xticks(rotation = 45)
plt.ylabel('F1 Score')
plt.title('F1 Score Comparison')
plt.show()
metrics_accuracy= [[acc_test_lr,acc_test_lr_down,acc_test_lr_up],
[acc_test_xgb,acc_test_xgb_down,acc_test_xgb_up],
[acc_test_mnb,acc_test_mnb_down,acc_test_mnb_up],
[acc_test_bnb,acc_test_bnb_down,acc_test_bnb_up],
[acc_test_rfc,acc_test_rfc_down,acc_test_rfc_up],
[acc_test_nn,acc_test_nn_down,acc_test_nn_up]
]
Final_metrics_accuracy = pd.DataFrame(metrics_accuracy, columns=['Test Accuracy', 'Test Accuracy Downsampling', 'Test Accuracy Upsampling'],
index=['Logistic Regression','XGBoost','Multinomial Naive Bayes', 'Bernoulli Naive Bayes', 'Random Forest Classifier', 'Neural Network'])
Final_metrics_accuracy.style.background_gradient(cmap='prism')
| Test Accuracy | Test Accuracy Downsampling | Test Accuracy Upsampling | |
|---|---|---|---|
| Logistic Regression | 0.775695 | 0.734583 | 0.761790 |
| XGBoost | 0.752116 | 0.703748 | 0.714027 |
| Multinomial Naive Bayes | 0.752116 | 0.714631 | 0.747884 |
| Bernoulli Naive Bayes | 0.724305 | 0.695284 | 0.705562 |
| Random Forest Classifier | 0.780532 | 0.716445 | 0.746675 |
| Neural Network | 0.745466 | 0.732164 | 0.764813 |
Final_metrics_accuracy.plot(kind='bar')
plt.title('Accuracy Comparision for various Models')
plt.legend(loc = 'lower right')
<matplotlib.legend.Legend at 0x143724050>
metrics_precision= [[prec_lr,prec_lr_down,prec_lr_up],
[prec_xgb,prec_xgb_down,prec_xgb_up],
[prec_mnb,prec_mnb_down,prec_mnb_up],
[prec_bnb,prec_bnb_down,prec_bnb_up],
[prec_rfc,prec_rfc_down,prec_rfc_up],
[prec_nn,prec_nn_down,prec_nn_up]
]
Final_metrics_precision = pd.DataFrame(metrics_precision, columns=['Test Precision', 'Test Precision Downsampling', 'Test Precision Upsampling'],
index=['Logistic Regression','XGBoost','Multinomial Naive Bayes', 'Bernoulli Naive Bayes', 'Random Forest', 'Neural Network'])
Final_metrics_precision.style.background_gradient(cmap='prism')
| Test Precision | Test Precision Downsampling | Test Precision Upsampling | |
|---|---|---|---|
| Logistic Regression | 0.778405 | 0.745708 | 0.765149 |
| XGBoost | 0.759716 | 0.729757 | 0.725894 |
| Multinomial Naive Bayes | 0.752587 | 0.720663 | 0.748075 |
| Bernoulli Naive Bayes | 0.748688 | 0.715313 | 0.732043 |
| Random Forest | 0.783301 | 0.746706 | 0.754368 |
| Neural Network | 0.794425 | 0.735246 | 0.779957 |
Final_metrics_precision.plot(kind='bar')
plt.title('Precision Comparision for various Models')
plt.legend(loc = 'lower right')
<matplotlib.legend.Legend at 0x143c1cf10>
metrics_recall= [[recall_lr,recall_lr_down,recall_lr_up],
[recall_xgb,recall_xgb_down,recall_xgb_up],
[recall_mnb,recall_mnb_down,recall_mnb_up],
[recall_bnb,recall_bnb_down,recall_bnb_up],
[recall_rfc,recall_rfc_down,recall_rfc_up],
[recall_nn,recall_nn_down,recall_nn_up]
]
Final_metrics_recall = pd.DataFrame(metrics_recall, columns=['Test Recall', 'Test Recall Downsampling', 'Test Recall Upsampling'],
index=['Logistic Regression','XGBoost','Multinomial Naive Bayes', 'Bernoulli Naive Bayes', 'Random Forest Classifier', 'Neural Network'])
Final_metrics_recall.style.background_gradient(cmap='prism')
| Test Recall | Test Recall Downsampling | Test Recall Upsampling | |
|---|---|---|---|
| Logistic Regression | 0.775695 | 0.734583 | 0.761790 |
| XGBoost | 0.752116 | 0.703748 | 0.714027 |
| Multinomial Naive Bayes | 0.752116 | 0.714631 | 0.747884 |
| Bernoulli Naive Bayes | 0.724305 | 0.695284 | 0.705562 |
| Random Forest Classifier | 0.780532 | 0.716445 | 0.746675 |
| Neural Network | 0.745466 | 0.732164 | 0.764813 |
Final_metrics_recall.plot(kind='bar')
plt.title('Recall Comparision for various Models')
plt.legend(loc = 'lower right')
<matplotlib.legend.Legend at 0x143d21090>
metrics_f1= [[f1_score_lr,f1_score_lr_down,f1_score_lr_up],
[f1_score_xgb,f1_score_xgb_down,f1_score_xgb_up],
[f1_score_mnb,f1_score_mnb_down,f1_score_mnb_up],
[f1_score_bnb,f1_score_bnb_down,f1_score_bnb_up],
[f1_score_rfc,f1_score_rfc_down,f1_score_rfc_up],
[f1_score_nn,f1_score_nn_down,f1_score_nn_up]
]
Final_metrics_f1 = pd.DataFrame(metrics_f1, columns=['Test f1-score', 'Test f1-score Downsampling', 'Test f1-score Upsampling'],
index=['Logistic Regression','XGBoost','Multinomial Naive Bayes', 'Bernoulli Naive Bayes', 'Random Forest Classifier', 'Neural Network'])
Final_metrics_f1.style.background_gradient(cmap='prism')
| Test f1-score | Test f1-score Downsampling | Test f1-score Upsampling | |
|---|---|---|---|
| Logistic Regression | 0.774395 | 0.736095 | 0.762514 |
| XGBoost | 0.750135 | 0.706381 | 0.715559 |
| Multinomial Naive Bayes | 0.751365 | 0.715189 | 0.747704 |
| Bernoulli Naive Bayes | 0.717429 | 0.696152 | 0.706626 |
| Random Forest Classifier | 0.779816 | 0.718361 | 0.747534 |
| Neural Network | 0.735720 | 0.732466 | 0.761493 |
Final_metrics_f1.plot(kind='bar')
plt.title('f1-score Comparision for various Models')
plt.legend(loc = 'lower right')
<matplotlib.legend.Legend at 0x143db4390>
metrics_training_time= [[Training_time_lr,Training_time_lr_downsample,Training_time_lr_upsample],
[Training_time_xg,Training_time_xg_down,Training_time_xg_up],
[Training_time_mnb,Training_time_mnb_down,Training_time_mnb_up],
[Training_time_bnb,Training_time_bnb_down,Training_time_bnb_up],
[Training_time_rfc,Training_time_rfc_down,Training_time_rfc_up],
[elapsed_time_nn,elapsed_time_nn_down,elapsed_time_nn_over]
]
Final_metrics_training_time = pd.DataFrame(metrics_f1, columns=['Test Training Time', 'Test f1-Training Time Downsampling', 'Test Training Time Upsampling'],
index=['Logistic Regression','XGBoost','Multinomial Naive Bayes', 'Bernoulli Naive Bayes', 'Random Forest Classifier', 'Neural Network'])
Final_metrics_training_time.style.background_gradient(cmap='prism')
| Test Training Time | Test f1-Training Time Downsampling | Test Training Time Upsampling | |
|---|---|---|---|
| Logistic Regression | 0.774395 | 0.736095 | 0.762514 |
| XGBoost | 0.750135 | 0.706381 | 0.715559 |
| Multinomial Naive Bayes | 0.751365 | 0.715189 | 0.747704 |
| Bernoulli Naive Bayes | 0.717429 | 0.696152 | 0.706626 |
| Random Forest Classifier | 0.779816 | 0.718361 | 0.747534 |
| Neural Network | 0.735720 | 0.732466 | 0.761493 |
Final_metrics_training_time.plot(kind='barh')
plt.title('Training time Comparision for various Models')
plt.legend(loc = 'lower right')
<matplotlib.legend.Legend at 0x143e74390>
# Create a new text input to predict
new_text = ['Climate Change Increasing Poverty And Vulnerability']
# Vectorize the new text input using the same TF-IDF vectorizer
new_text_vectorized = cv.transform(new_text).toarray()
# Predict using the trained logistic regression model
predicted_label = model1_lr.predict(new_text_vectorized)
a = label_encoder.classes_
print("Predicted Label:", predicted_label)
print(mapping)
Predicted Label: [0]
{'Negative': 0, 'Neutral': 1, 'Positive': 2}
# Create a new text input to predict
new_text = ['Global warming is truly affecting the weather patterns of this planet negatively and causing destructive weather all over this world.']
# Vectorize the new text input using the same TF-IDF vectorizer
new_text_vectorized = cv.transform(new_text).toarray()
# Predict using the trained logistic regression model
predicted_label = model5_rfc.predict(new_text_vectorized)
a = label_encoder.classes_
print("Predicted Label:", predicted_label)
print(mapping)
Predicted Label: [2]
{'Negative': 0, 'Neutral': 1, 'Positive': 2}
# Create a new text input to predict
new_text = ['Global warming is truly affecting the weather patterns of this planet negatively and causing destructive weather all over this world.']
# Vectorize the new text input using the same TF-IDF vectorizer
new_text_vectorized = cv.transform(new_text).toarray()
# Predict using the trained logistic regression model
predicted_label = model.predict(new_text_vectorized)
a = label_encoder.classes_
print("Predicted Label:", predicted_label)
print(mapping)
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 43ms/step Predicted Label: [[0.07718909 0.04763801 0.875173 ]] {'Negative': 0, 'Neutral': 1, 'Positive': 2}
import pickle
file_name = "Sentiment_Analysis_Climate_Change_Logistic_Regression.pickle"
pickle.dump(model1_lr, open(file_name, "wb"))
import pickle
file_name = "Sentiment_Analysis_Climate_Change_Neural_Network_upsampling.pickle"
pickle.dump(model3, open(file_name, "wb"))
If we pollute the air, water and soil that keep us alive and well, and destroy the biodiversity that allows natural systems to function, no amount of money will save us.
David Suzuki
Climate change is destroying our path to sustainability. Ours is a world of looming challenges and increasingly limited resources. Sustainable development offers the best chance to adjust our course.
Ban Ki-moon